Genkit-UI/src/flows/indexDocumentsFlow.ts at main · ssdeanx/Genkit-UI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import { ai } from '../config.js';
import { z } from 'genkit';
import { readFile } from 'fs/promises';
import path from 'path';
import { chunk } from 'llm-chunk';
import { Document } from 'genkit/retriever';
import { devLocalIndexerRef } from '@genkit-ai/dev-local-vectorstore';
import { VECTORSTORE_INDEX } from '../config.js';

/**
 * Options handed to `chunk()` from `llm-chunk` when splitting document text.
 *
 * The per-option notes below follow the llm-chunk README; confirm against the
 * installed version if the chunking behaviour matters.
 */
const chunkingConfig: Record<string, number | string> = {
  // Split strategy: break on sentence boundaries.
  splitter: 'sentence',
  // Extra regex for the base split; empty string leaves the default in place.
  delimiters: '',
  // Lower / upper bound on the number of characters per chunk.
  minLength: 1000,
  maxLength: 2000,
  // Number of characters shared between neighbouring chunks.
  overlap: 100,
};

/** Reference to the dev-local vector store indexer for the index named by `VECTORSTORE_INDEX`. */
const indexer = devLocalIndexerRef(VECTORSTORE_INDEX);

/**
 * Input accepted by the `indexDocuments` flow. Every field is optional at the
 * schema level; the flow itself decides whether there is anything to index.
 */
const IndexInputSchema = z
  .object({
    // Path of a file on disk to read when no usable `text` is supplied.
    filePath: z.optional(z.string()),
    // Raw text to index directly.
    text: z.optional(z.string()),
    // Identifier recorded with each indexed chunk.
    sourceId: z.optional(z.string()),
    // Free-form key/value data recorded with each indexed chunk.
    metadata: z.optional(z.record(z.any())),
  })
  .describe('Ingest text or PDF into the configured vector index');

/**
 * Genkit flow `indexDocuments`: splits text into chunks and writes them to the
 * configured vector index.
 *
 * Input resolution:
 *  - `text` is used when it contains at least one non-whitespace character;
 *  - otherwise, when `filePath` is non-blank, the file is read from disk and
 *    its text is extracted with `pdf-parse`.
 *
 * Each chunk becomes one `Document` whose metadata carries `sourceId` and the
 * caller's `metadata` (or `{}`). When `sourceId` is omitted, chunks are tagged
 * `doc-<timestamp>-<chunkIndex>`, with the timestamp taken once per run.
 *
 * @returns `{ success: true, documentsIndexed }` on success. Any error raised
 *   inside the handler is caught and reported as
 *   `{ success: false, documentsIndexed: 0, error }` instead of being thrown.
 */
export const indexDocumentsFlow = ai.defineFlow(
  {
    name: 'indexDocuments',
    inputSchema: IndexInputSchema,
    outputSchema: z.object({ success: z.boolean(), documentsIndexed: z.number(), error: z.string().optional() }),
  },
  async (input: z.infer<typeof IndexInputSchema>) => {
    try {
      const { filePath, text, sourceId, metadata } = input;
      let content = typeof text === 'string' ? text : '';

      // Treat whitespace-only text as absent so that a supplied filePath is
      // still honoured instead of being silently ignored.
      if (content.trim().length === 0 && typeof filePath === 'string' && filePath.trim().length > 0) {
        // NOTE(review): filePath is caller-supplied and is read without any
        // directory restriction — confirm callers are trusted.
        const resolved = path.resolve(filePath);
        const buffer = await readFile(resolved);
        // Dynamic import to avoid loading pdf-parse test data at module load time
        const { default: pdf } = await import('pdf-parse');
        const data = await pdf(buffer);
        content = typeof data.text === 'string' ? data.text : '';
      }

      if (content.trim().length === 0) {
        return { success: false, documentsIndexed: 0, error: 'No text to index' };
      }

      const chunksRes = await ai.run('chunk-it', async () => chunk(content, chunkingConfig));
      const chunks = toStringArray(chunksRes);

      // Take the timestamp once so every chunk of this run shares the same
      // fallback prefix; only the trailing chunk index differs.
      const fallbackSourcePrefix = `doc-${Date.now()}`;
      const documents = chunks.map((c, idx) =>
        Document.fromText(c, { sourceId: sourceId ?? `${fallbackSourcePrefix}-${idx}`, metadata: metadata ?? {} })
      );

      await ai.index({ indexer, documents });

      return { success: true, documentsIndexed: documents.length };
    } catch (err) {
      return { success: false, documentsIndexed: 0, error: err instanceof Error ? err.message : String(err) };
    }
  }
);

/**
 * Coerces an arbitrary value into a list of strings.
 *
 * - `null` / `undefined` yield an empty list.
 * - An array yields only its string elements, in order; other elements are dropped.
 * - Any other value yields a single-element list holding `String(value)`.
 *
 * @param value - Value to normalise.
 * @returns A new array of strings.
 */
function toStringArray(value: unknown): string[] {
  if (value == null) {
    return [];
  }
  if (!Array.isArray(value)) {
    return [String(value)];
  }

  const strings: string[] = [];
  for (const item of value) {
    if (typeof item === 'string') {
      strings.push(item);
    }
  }
  return strings;
}