-
-
Notifications
You must be signed in to change notification settings - Fork 143
Open
Labels
RFC: Request For Comments (proposals about features that you want to be discussed)
Store: Issues & PRs about the AI Store component
Description
I'm currently experimenting with ChromaDB and embeddings and have noticed that individual chunks are often insufficient to understand the context. I get much better results when I extend the context window using chunk padding: during indexing, I store the chunk index in the metadata, and during the similarity search, I additionally read the two chunks before and the two chunks after each hit.
What do you think about implementing this in the core?
I think it would be a great feature for extending the context window.
Example (the code was written quickly and, of course, still needs optimization :-)):
<?php
declare(strict_types=1);
namespace Alpdesk\AlpdeskAi\Storage\Store\Transformer;
use Symfony\AI\Store\Document\Metadata;
use Symfony\AI\Store\Document\TextDocument;
use Symfony\AI\Store\Document\TransformerInterface;
use Symfony\AI\Store\Exception\InvalidArgumentException;
use Symfony\Component\Uid\Uuid;
/**
 * Splits text documents into fixed-size, overlapping chunks and records each
 * chunk's position so that neighbouring chunks can be looked up later.
 *
 * Sizes are measured in characters (mb_strlen/mb_substr), not bytes.
 */
class TextSplitTransformer implements TransformerInterface
{
    public const string OPTION_CHUNK_SIZE = 'chunk_size';
    public const string OPTION_OVERLAP = 'overlap';

    /**
     * @param int $chunkSize default maximum number of characters per chunk
     * @param int $overlap   default number of characters shared by two consecutive chunks
     *
     * @throws InvalidArgumentException when the overlap is negative or not smaller than the chunk size
     */
    public function __construct(
        private readonly int $chunkSize = 1000,
        private readonly int $overlap = 200,
    ) {
        if ($this->overlap < 0 || $this->overlap >= $this->chunkSize) {
            throw new InvalidArgumentException(\sprintf('Overlap must be non-negative and less than chunk size. Got chunk size: %d, overlap: %d.', $this->chunkSize, $this->overlap));
        }
    }

    /**
     * Yields every text document either unchanged (when it fits into a single
     * chunk) or as a sequence of chunk documents carrying the parent id, the
     * chunk text and a zero-based 'chunk_index' in their metadata.
     *
     * Documents that are not TextDocument instances are skipped and not yielded.
     *
     * @param array{chunk_size?: int, overlap?: int} $options per-call overrides of the constructor defaults
     *
     * @throws InvalidArgumentException when the effective overlap is negative or not smaller than the chunk size
     */
    public function transform(iterable $documents, array $options = []): iterable
    {
        $chunkSize = $options[self::OPTION_CHUNK_SIZE] ?? $this->chunkSize;
        $overlap = $options[self::OPTION_OVERLAP] ?? $this->overlap;

        if ($overlap < 0 || $overlap >= $chunkSize) {
            throw new InvalidArgumentException('Overlap must be non-negative and less than chunk size.');
        }

        // The validation above guarantees $step >= 1, so the loop below always advances.
        $step = $chunkSize - $overlap;

        foreach ($documents as $document) {
            if (!$document instanceof TextDocument) {
                continue;
            }

            $text = $document->getContent();
            $length = \mb_strlen($text);

            if ($length <= $chunkSize) {
                yield $document;

                continue;
            }

            $chunkIndex = 0;

            for ($start = 0; $start < $length; $start += $step) {
                // mb_substr() clamps at the end of the string, so the last chunk may be shorter.
                $chunkText = \mb_substr($text, $start, $chunkSize);

                // Parent metadata is spread first so the chunk-specific keys below
                // always win; otherwise a parent entry with the same key would
                // overwrite the chunk's own text, parent id or index.
                yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
                    ...$document->getMetadata(),
                    Metadata::KEY_PARENT_ID => $document->getId(),
                    Metadata::KEY_TEXT => $chunkText,
                    'chunk_index' => $chunkIndex,
                ]));

                ++$chunkIndex;

                // Once a chunk reaches the end of the text, another step would only
                // produce a chunk fully contained in this one.
                if ($start + $chunkSize >= $length) {
                    break;
                }
            }
        }
    }
}
<?php
declare(strict_types=1);
namespace Alpdesk\AlpdeskAi\Toolbox\Tool;
use Symfony\AI\Agent\Toolbox\Attribute\AsTool;
use Symfony\AI\Platform\Vector\Vector;
use Symfony\AI\Store\Document\VectorizerInterface;
use Symfony\AI\Store\StoreInterface;
use Symfony\AI\Store\Document\Metadata;
/**
 * Agent tool that runs a similarity search and widens every hit with the text
 * of its neighbouring chunks ("chunk padding").
 *
 * Relies on a 'chunk_index' metadata entry being present on indexed chunks;
 * hits without it are returned unpadded.
 */
#[AsTool('documents_similarity_search', description: 'Searches for pdf documents similar to a query or sentence.')]
class SimilaritySearchDocuments
{
    private const string CHUNK_SEPARATOR = '--- CHUNK SEPARATOR ---';

    /**
     * Hits of the most recent search. Their metadata text is replaced by the
     * padded text when neighbouring chunks were found.
     */
    public array $usedDocuments = [];

    /**
     * @param int $chunkPadding number of chunks to read before and after each hit; values <= 0 disable padding
     */
    public function __construct(
        private readonly VectorizerInterface $vectorizer,
        private readonly StoreInterface $store,
        private readonly int $chunkPadding = 2
    ) {
    }

    /**
     * @param string $searchTerm string used for similarity search
     *
     * @return string a header line followed by the JSON-encoded metadata of every hit,
     *                or 'No results found' when the store returns nothing
     */
    public function __invoke(string $searchTerm): string
    {
        $vector = $this->vectorizer->vectorize($searchTerm);

        // preserve_keys: false keeps every result even if the store yields duplicate keys.
        $this->usedDocuments = \iterator_to_array($this->store->query($vector, ['include' => ['documents']]), false);

        if ([] === $this->usedDocuments) {
            return 'No results found';
        }

        $result = 'Found documents with following information:' . \PHP_EOL;

        foreach ($this->usedDocuments as $document) {
            try {
                $metaData = $document->metadata;
                $this->padWithNeighbourChunks($metaData, $document->vector);

                $result .= \json_encode($metaData, \JSON_THROW_ON_ERROR);
            } catch (\Throwable) {
                // Deliberately best-effort: a hit whose metadata cannot be read or
                // encoded is left out of the answer instead of failing the whole tool call.
            }
        }

        return $result;
    }

    /**
     * Replaces the hit's text with "chunks before + hit + chunks after", joined
     * by a separator line and kept in chunk order.
     *
     * Does nothing when padding is disabled, the hit has no 'chunk_index', the
     * vector is not a Vector instance, or no neighbour text was found.
     */
    private function padWithNeighbourChunks(Metadata $metaData, mixed $documentVector): void
    {
        $chunkIndex = $metaData['chunk_index'] ?? null;

        if (null === $chunkIndex || $this->chunkPadding <= 0 || !$documentVector instanceof Vector) {
            return;
        }

        $chunkIndex = (int) $chunkIndex;
        $firstIndex = \max(0, $chunkIndex - $this->chunkPadding);
        // Inclusive upper bound, so that $chunkPadding chunks are read on both sides.
        $lastIndex = $chunkIndex + $this->chunkPadding;

        $parts = [];

        for ($index = $firstIndex; $index <= $lastIndex; ++$index) {
            if ($index === $chunkIndex) {
                $parts[] = $metaData->getText() ?? '';

                continue;
            }

            // Always search with the hit's own vector; a missing neighbour must
            // not prevent the remaining neighbours from being looked up.
            $neighbourText = $this->findChunkText($documentVector, $index);

            if (null !== $neighbourText) {
                $parts[] = $neighbourText;
            }
        }

        if (\count($parts) > 1) {
            $metaData->setText(\implode(\PHP_EOL . self::CHUNK_SEPARATOR . \PHP_EOL, $parts));
        }
    }

    /**
     * Returns the trimmed text of the best match carrying the given chunk index,
     * or null when nothing usable is found or the lookup fails.
     *
     * NOTE(review): the filter only matches on 'chunk_index', so the best match
     * may belong to a different parent document than the hit. Restricting it to
     * the hit's parent id needs store-specific filter syntax - TODO confirm.
     */
    private function findChunkText(Vector $vector, int $index): ?string
    {
        try {
            $matches = \iterator_to_array($this->store->query($vector, [
                'where' => ['chunk_index' => $index],
                'include' => ['documents'],
            ]), false);
        } catch (\Throwable) {
            // Padding is optional context; a failed lookup must not discard the hit itself.
            return null;
        }

        $neighbourMeta = $matches[0]->metadata ?? null;

        if (!$neighbourMeta instanceof Metadata) {
            return null;
        }

        $text = $neighbourMeta->getText();

        if (!\is_string($text)) {
            return null;
        }

        $text = \trim($text);

        return '' === $text ? null : $text;
    }
}
Metadata
Metadata
Assignees
Labels
RFC: Request For Comments (proposals about features that you want to be discussed)
Store: Issues & PRs about the AI Store component