Skip to content

[Store][ChromaDB] ChunkPadding or ContextWindow #1111

@xprojects-de

Description

@xprojects-de

I'm currently experimenting with ChromaDB and embeddings and have noticed that an individual chunk is often not enough to understand its context. I get much better results when I extend the context window using chunk padding: during indexing I store each chunk's index in its metadata, and during the similarity search I additionally read the two chunks before and the two chunks after every hit.

What do you think about implementing this in the core?
I think it would be a great feature for extending the context window.

Example (The code was just quickly programmed and of course still needs optimization :-)):

<?php

declare(strict_types=1);

namespace Alpdesk\AlpdeskAi\Storage\Store\Transformer;

use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\TransformerInterface;
use Symfony\AI\Store\Exception\InvalidArgumentException;
use Symfony\Component\Uid\Uuid;

/**
 * Splits text documents into fixed-size, overlapping chunks.
 *
 * Every emitted chunk records the id of the document it was cut from, its own
 * text and its zero-based position ("chunk_index") in its metadata.
 */
class TextSplitTransformer implements TransformerInterface
{
    public const string OPTION_CHUNK_SIZE = 'chunk_size';
    public const string OPTION_OVERLAP = 'overlap';

    /**
     * @param int $chunkSize Default maximum chunk length in characters (as counted by mb_strlen)
     * @param int $overlap   Default number of characters shared by two consecutive chunks
     *
     * @throws InvalidArgumentException When the overlap is negative or not smaller than the chunk size
     */
    public function __construct(
        private readonly int $chunkSize = 1000,
        private readonly int $overlap = 200,
    )
    {
        if ($this->overlap < 0 || $this->overlap >= $this->chunkSize) {
            throw new InvalidArgumentException(\sprintf('Overlap must be non-negative and less than chunk size. Got chunk size: %d, overlap: %d.', $this->chunkSize, $this->overlap));
        }
    }

    /**
     * Yields the given documents, replacing every text document that is longer
     * than the chunk size by a sequence of overlapping chunk documents.
     *
     * Documents that are not TextDocument instances are skipped. Text documents
     * that already fit into one chunk are yielded unchanged (and therefore carry
     * no "chunk_index"). Because this method is a generator, the option
     * validation below only runs once the result is iterated.
     *
     * @param array{chunk_size?: int, overlap?: int} $options Per-call overrides of the constructor defaults
     *
     * @throws InvalidArgumentException When the effective overlap is negative or not smaller than the chunk size
     */
    public function transform(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? $this->chunkSize;
        $overlap = $options[self::OPTION_OVERLAP] ?? $this->overlap;

        if ($overlap < 0 || $overlap >= $chunkSize) {
            throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');
        }

        // Distance between the start offsets of two consecutive chunks; always
        // positive thanks to the validation above, so the loop below terminates.
        $step = $chunkSize - $overlap;

        foreach ($documents as $document) {
            if (!$document instanceof TextDocument) {
                continue;
            }

            $text = $document->getContent();
            $length = \mb_strlen($text);

            if ($length <= $chunkSize) {
                yield $document;
                continue;
            }

            $chunkIndex = 0;

            for ($start = 0; $start < $length; $start += $step) {
                // mb_substr() clamps at the end of the string, so the last chunk may be shorter.
                $chunkText = \mb_substr($text, $start, $chunkSize);

                // The parent's metadata is spread first so that the chunk-specific
                // keys below always win; otherwise a parent carrying its own
                // text/parent id/chunk index would overwrite the chunk's values.
                yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
                    ...$document->getMetadata(),
                    Metadata::KEY_PARENT_ID => $document->getId(),
                    Metadata::KEY_TEXT => $chunkText,
                    'chunk_index' => $chunkIndex,
                ]));

                ++$chunkIndex;

                // This chunk already reached the end of the text. Another iteration
                // would only emit a chunk consisting purely of overlap.
                if ($start + $chunkSize >= $length) {
                    break;
                }
            }
        }
    }
}
<?php

declare(strict_types=1);

namespace Alpdesk\AlpdeskAi\Toolbox\Tool;

use Symfony\AI\Agent\Toolbox\Attribute\AsTool;
use Symfony\AI\Platform\Vector\Vector;
use Symfony\AI\Store\Document\VectorizerInterface;
use Symfony\AI\Store\StoreInterface;
use Symfony\AI\Store\Document\Metadata;

/**
 * Similarity search tool that widens every hit with the text of its
 * neighbouring chunks ("chunk padding").
 *
 * Neighbours are located through the "chunk_index" metadata entry of the hit.
 */
#[AsTool('documents_similarity_search', description: 'Searches for pdf documents similar to a query or sentence.')]
class SimilaritySearchDocuments
{
    /** Text placed between two chunks in the padded text. */
    private const CHUNK_SEPARATOR = \PHP_EOL . '--- CHUNK SEPARATOR ---' . \PHP_EOL;

    /**
     * Documents returned by the most recent search. When padding succeeds, the
     * text stored in a hit's metadata is replaced in place by the padded text.
     */
    public array $usedDocuments = [];

    /**
     * @param int $chunkPadding Number of chunks to read before and after every hit; values <= 0 disable padding
     */
    public function __construct(
        private readonly VectorizerInterface $vectorizer,
        private readonly StoreInterface      $store,
        private readonly int                 $chunkPadding = 2
    )
    {
    }

    /**
     * @param string $searchTerm string used for similarity search
     *
     * @return string 'No results found', or a header line followed by the JSON-encoded metadata of every hit
     */
    public function __invoke(string $searchTerm): string
    {
        $vector = $this->vectorizer->vectorize($searchTerm);
        $this->usedDocuments = \iterator_to_array($this->store->query($vector, ['include' => ['documents']]));

        if ([] === $this->usedDocuments) {
            return 'No results found';
        }

        $result = 'Found documents with following information:' . \PHP_EOL;

        foreach ($this->usedDocuments as $document) {
            try {
                $metaData = $document->metadata;
                $chunkIndex = $metaData['chunk_index'] ?? null;

                if (null !== $chunkIndex && $this->chunkPadding > 0 && $metaData instanceof Metadata) {
                    try {
                        $paddedText = $this->buildPaddedText((int) $chunkIndex, $document->vector, $metaData);

                        if (null !== $paddedText) {
                            $metaData->setText($paddedText);
                        }
                    } catch (\Throwable) {
                        // Padding is an optional enrichment: when a neighbour lookup
                        // fails, the hit is still reported with its own text. The
                        // metadata is untouched because setText() was not reached.
                    }
                }

                $result .= \json_encode($metaData, JSON_THROW_ON_ERROR);
            } catch (\Throwable) {
                // Deliberately best-effort: a hit whose metadata cannot be read or
                // encoded is left out instead of failing the whole tool call.
            }
        }

        return $result;
    }

    /**
     * Builds the text of a hit surrounded by its neighbouring chunks, in reading
     * order: preceding chunks, the hit itself, following chunks.
     *
     * @param int      $chunkIndex Position of the hit inside its parent document
     * @param mixed    $vector     Vector of the hit; nothing is looked up unless it is a Vector
     * @param Metadata $hit        Metadata of the hit
     *
     * @return string|null The padded text, or null when no neighbouring text was found
     */
    private function buildPaddedText(int $chunkIndex, mixed $vector, Metadata $hit): ?string
    {
        if (!$vector instanceof Vector) {
            return null;
        }

        $before = [];
        $after = [];
        $first = \max(0, $chunkIndex - $this->chunkPadding);
        $last = $chunkIndex + $this->chunkPadding;

        // Inclusive upper bound, so that $chunkPadding chunks are read after the hit as well.
        for ($index = $first; $index <= $last; ++$index) {
            if ($index === $chunkIndex) {
                continue;
            }

            $neighbour = $this->findChunk($vector, $index);
            $neighbourMeta = $neighbour?->metadata ?? null;

            // The store filter only constrains the chunk index, so the nearest
            // match may belong to a different parent document; such candidates
            // are discarded.
            // NOTE(review): filtering on the parent id inside the store query would
            // be preferable, but the syntax for combining conditions is store-specific.
            if (!$neighbourMeta instanceof Metadata || !self::haveSameParent($hit, $neighbourMeta)) {
                continue;
            }

            // Continue the search from the neighbour that was just found. When it
            // exposes no vector, keep the previous one so that a single gap does
            // not disable the remaining lookups.
            $neighbourVector = $neighbour->vector ?? null;
            if ($neighbourVector instanceof Vector) {
                $vector = $neighbourVector;
            }

            $text = $neighbourMeta->getText();
            if (!\is_string($text) || '' === $text) {
                continue;
            }

            if ($index < $chunkIndex) {
                $before[] = \trim($text);
            } else {
                $after[] = \trim($text);
            }
        }

        if ([] === $before && [] === $after) {
            return null;
        }

        return \implode(self::CHUNK_SEPARATOR, [...$before, $hit->getText() ?? '', ...$after]);
    }

    /**
     * Returns the first document the store yields for the given chunk index,
     * searched by similarity to $vector, or null when there is none.
     */
    private function findChunk(Vector $vector, int $chunkIndex): ?object
    {
        $options = [
            'where' => ['chunk_index' => $chunkIndex],
            'include' => ['documents'],
        ];

        foreach ($this->store->query($vector, $options) as $candidate) {
            return \is_object($candidate) ? $candidate : null;
        }

        return null;
    }

    /**
     * Tells whether two chunks were cut from the same parent document.
     *
     * When either parent id is missing or cannot be compared as a string, the
     * chunks are treated as belonging together, because the opposite cannot be shown.
     */
    private static function haveSameParent(Metadata $hit, Metadata $candidate): bool
    {
        $normalise = static fn (mixed $id): ?string => \is_scalar($id) || $id instanceof \Stringable ? (string) $id : null;

        $expected = $normalise($hit[Metadata::KEY_PARENT_ID] ?? null);
        $actual = $normalise($candidate[Metadata::KEY_PARENT_ID] ?? null);

        if (null === $expected || null === $actual) {
            return true;
        }

        return $expected === $actual;
    }
}

Metadata

Metadata

Assignees

Labels

RFC (RFC = Request For Comments: proposals about features that you want to be discussed) · Store (Issues & PRs about the AI Store component)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions