-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathindexDocumentsFlow.ts
More file actions
76 lines (66 loc) · 2.59 KB
/
indexDocumentsFlow.ts
File metadata and controls
76 lines (66 loc) · 2.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import { ai } from '../config.js';
import { z } from 'genkit';
import { readFile } from 'fs/promises';
import path from 'path';
import { chunk } from 'llm-chunk';
import { Document } from 'genkit/retriever';
import { devLocalIndexerRef } from '@genkit-ai/dev-local-vectorstore';
import { VECTORSTORE_INDEX } from '../config.js';
/**
 * Options handed to llm-chunk's `chunk()` when splitting document text
 * inside the indexing flow.
 *
 * NOTE(review): the per-option meanings below follow the option names;
 * confirm them against the llm-chunk documentation.
 */
const chunkingConfig: Record<string, number | string> = {
  // Presumably the lower bound on chunk size.
  minLength: 1000,
  // Presumably the upper bound on chunk size.
  maxLength: 2000,
  // Split on sentence boundaries.
  splitter: 'sentence',
  // Presumably how much neighbouring chunks overlap.
  overlap: 100,
  // Empty string: no custom delimiters supplied.
  delimiters: '',
};
// Indexer reference for the dev-local vector store index named by VECTORSTORE_INDEX;
// passed to `ai.index` when documents are ingested.
const indexer = devLocalIndexerRef(VECTORSTORE_INDEX);
/**
 * Input accepted by the `indexDocuments` flow.
 *
 * Every field is optional at the schema level; the flow itself reports
 * 'No text to index' when it ends up with no usable content.
 */
const IndexInputSchema = z
  .object({
    // Path of a file the flow reads and parses with pdf-parse when it has no usable `text`.
    filePath: z.string().optional(),
    // Raw text to index; takes precedence over `filePath`.
    text: z.string().optional(),
    // Identifier recorded on each chunk document; the flow generates one when omitted.
    sourceId: z.string().optional(),
    // Free-form key/value data attached to each chunk document.
    metadata: z.record(z.any()).optional(),
  })
  .describe('Ingest text or PDF into the configured vector index');
/**
 * Genkit flow that ingests raw text or a PDF file into the configured vector index.
 *
 * Content comes from `input.text` when it holds non-whitespace characters;
 * otherwise, when `input.filePath` is non-blank, the file is read and parsed
 * with `pdf-parse`. The content is split with `chunk()` using `chunkingConfig`,
 * each chunk is wrapped in a `Document`, and the batch is passed to `ai.index`.
 *
 * Errors raised inside the handler are caught and reported in the result
 * rather than rethrown.
 *
 * @param input - See `IndexInputSchema`: optional `filePath`, `text`, `sourceId`, `metadata`.
 * @returns `{ success: true, documentsIndexed }` on success, or
 *   `{ success: false, documentsIndexed: 0, error }` when there is nothing to
 *   index or any step fails.
 */
export const indexDocumentsFlow = ai.defineFlow(
  {
    name: 'indexDocuments',
    inputSchema: IndexInputSchema,
    outputSchema: z.object({
      success: z.boolean(),
      documentsIndexed: z.number(),
      error: z.string().optional(),
    }),
  },
  async (input: z.infer<typeof IndexInputSchema>) => {
    try {
      const { filePath, text, sourceId, metadata } = input;
      let content = typeof text === 'string' ? text : '';

      // Fall back to the file whenever `text` carries nothing indexable. Testing
      // the trimmed length (not mere truthiness) keeps a whitespace-only `text`
      // from shadowing a valid `filePath`.
      if (content.trim().length === 0 && typeof filePath === 'string' && filePath.trim().length > 0) {
        const resolved = path.resolve(filePath);
        const buffer = await readFile(resolved);
        // Dynamic import to avoid loading pdf-parse test data at module load time
        const { default: pdf } = await import('pdf-parse');
        const data = await pdf(buffer);
        content = typeof data.text === 'string' ? data.text : '';
      }

      if (content.trim().length === 0) {
        return { success: false, documentsIndexed: 0, error: 'No text to index' };
      }

      // Chunking runs as its own named step ('chunk-it') via `ai.run`.
      const chunksRes = await ai.run('chunk-it', async () => chunk(content, chunkingConfig));
      const chunks = toStringArray(chunksRes);

      // Take the timestamp once per ingest so the fallback ids of sibling chunks
      // share the same `doc-<timestamp>` prefix and differ only by chunk index.
      const fallbackPrefix = `doc-${Date.now()}`;

      // NOTE(review): caller-supplied metadata ends up under a nested `metadata`
      // key of the object given to Document.fromText — confirm consumers expect
      // that shape before flattening it.
      const documents = chunks.map((c, idx) =>
        Document.fromText(c, {
          sourceId: sourceId ?? `${fallbackPrefix}-${idx}`,
          metadata: metadata ?? {},
        })
      );

      await ai.index({ indexer, documents });
      return { success: true, documentsIndexed: documents.length };
    } catch (err) {
      // Deliberate best-effort contract: surface the failure in the result object.
      return {
        success: false,
        documentsIndexed: 0,
        error: err instanceof Error ? err.message : String(err),
      };
    }
  }
);
/**
 * Coerces an arbitrary value into an array of strings.
 *
 * - `null` / `undefined` yield an empty array.
 * - An array yields its string elements only, in their original order;
 *   non-string elements are dropped.
 * - Any other value yields a one-element array holding `String(value)`.
 *
 * @param value - Value of unknown shape to normalise.
 * @returns The normalised list of strings.
 */
function toStringArray(value: unknown): string[] {
  if (value == null) {
    return [];
  }
  if (!Array.isArray(value)) {
    return [String(value)];
  }

  const strings: string[] = [];
  for (const item of value) {
    if (typeof item === 'string') {
      strings.push(item);
    }
  }
  return strings;
}