Skip to content

Commit 097a8ab

Browse files
Copilot and devlux76 committed
feat(#90): integrate HierarchyBuilder into ingestion pipeline
ingestText() now calls buildHierarchy() after page creation to produce the full Books→Volumes→Shelves hierarchy on every ingest call. IngestResult extended with books[], volumes[], shelves[] fields. Closes #90 Co-authored-by: devlux76 <[email protected]>
1 parent e6b44bf commit 097a8ab

File tree

3 files changed

+80
-83
lines changed

3 files changed

+80
-83
lines changed

lib/hippocampus/Ingest.ts

Lines changed: 26 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
import type { Book, MetadataStore, VectorStore } from "../core/types";
1+
import type { Book, MetadataStore, Volume, Shelf, VectorStore } from "../core/types";
22
import type { ModelProfile } from "../core/ModelProfile";
33
import { hashText } from "../core/crypto/hash";
44
import type { KeyPair } from "../core/crypto/sign";
55
import { EmbeddingRunner } from "../embeddings/EmbeddingRunner";
66
import { chunkText } from "./Chunker";
77
import { buildPage } from "./PageBuilder";
8-
import { runPromotionSweep } from "../core/SalienceEngine";
98
import { insertSemanticNeighbors } from "./FastNeighborInsert";
9+
import { buildHierarchy } from "./HierarchyBuilder";
1010

1111
export interface IngestOptions {
1212
modelProfile: ModelProfile;
@@ -19,46 +19,15 @@ export interface IngestOptions {
1919

2020
export interface IngestResult {
2121
pages: Array<Awaited<ReturnType<typeof buildPage>>>;
22-
/** The single Book representing everything ingested by this call.
23-
* One ingest call = one Book, always. All pages are members.
24-
* A collection of Books becomes a Volume; a collection of Volumes
25-
* becomes a Shelf — those tiers are assembled by the Daydreamer. */
22+
/** All Books produced by this ingest call. The hierarchy builder chunks
23+
* pages into books of up to PAGES_PER_BOOK and computes a medoid for each. */
24+
books: Book[];
25+
/** Convenience alias for `books[0]` — undefined when no pages were ingested. */
2626
book?: Book;
27-
}
28-
29-
function cosineDistance(a: Float32Array, b: Float32Array): number {
30-
let dot = 0;
31-
let normA = 0;
32-
let normB = 0;
33-
for (let i = 0; i < a.length; i++) {
34-
dot += a[i] * b[i];
35-
normA += a[i] * a[i];
36-
normB += b[i] * b[i];
37-
}
38-
const denom = Math.sqrt(normA) * Math.sqrt(normB);
39-
if (denom === 0) return 0;
40-
return 1 - dot / denom;
41-
}
42-
43-
/**
44-
* Selects the index of the medoid: the element that minimises total cosine
45-
* distance to every other element in the set.
46-
*/
47-
function selectMedoidIndex(vectors: Float32Array[]): number {
48-
if (vectors.length === 1) return 0;
49-
let bestIdx = 0;
50-
let bestTotal = Infinity;
51-
for (let i = 0; i < vectors.length; i++) {
52-
let total = 0;
53-
for (let j = 0; j < vectors.length; j++) {
54-
if (i !== j) total += cosineDistance(vectors[i], vectors[j]);
55-
}
56-
if (total < bestTotal) {
57-
bestTotal = total;
58-
bestIdx = i;
59-
}
60-
}
61-
return bestIdx;
27+
/** Volumes produced by grouping books during hierarchy construction. */
28+
volumes: Volume[];
29+
/** Shelves produced by grouping volumes during hierarchy construction. */
30+
shelves: Shelf[];
6231
}
6332

6433
export async function ingestText(
@@ -76,7 +45,7 @@ export async function ingestText(
7645

7746
const chunks = chunkText(text, modelProfile);
7847
if (chunks.length === 0) {
79-
return { pages: [], book: undefined };
48+
return { pages: [], books: [], book: undefined, volumes: [], shelves: [] };
8049
}
8150

8251
const createdAt = new Date(now).toISOString();
@@ -124,23 +93,7 @@ export async function ingestText(
12493
});
12594
}
12695

127-
// Build ONE Book for the entire ingest.
128-
// A Book = the document we just ingested; its identity is the sorted set of
129-
// its pages. Its representative is the page whose embedding is the medoid
130-
// (minimum total cosine distance to all other pages in the document).
131-
const medoidIdx = selectMedoidIndex(embeddings);
132-
const sortedPageIds = [...pageIds].sort();
133-
const bookId = await hashText(sortedPageIds.join("|"));
134-
const book: Book = {
135-
bookId,
136-
pageIds,
137-
medoidPageId: pageIds[medoidIdx],
138-
meta: {},
139-
};
140-
await metadataStore.putBook(book);
141-
14296
// Insert semantic neighbor edges for the new pages against all stored pages.
143-
// Volumes and Shelves are assembled by the Daydreamer from accumulated Books.
14497
const allPages = await metadataStore.getAllPages();
14598
const allPageIds = allPages.map((p) => p.pageId);
14699
await insertSemanticNeighbors(pageIds, allPageIds, {
@@ -149,8 +102,20 @@ export async function ingestText(
149102
metadataStore,
150103
});
151104

152-
// Run hotpath promotion for the newly ingested pages and book.
153-
await runPromotionSweep([...pageIds, bookId], metadataStore);
105+
// Build the full hierarchy: Pages → Books → Volumes → Shelves.
106+
// buildHierarchy handles medoid selection, adjacency edges, prototype
107+
// computation, Williams fanout enforcement, and promotion sweeps.
108+
const hierarchy = await buildHierarchy(pageIds, {
109+
modelProfile,
110+
vectorStore,
111+
metadataStore,
112+
});
154113

155-
return { pages, book };
114+
return {
115+
pages,
116+
books: hierarchy.books,
117+
book: hierarchy.books[0],
118+
volumes: hierarchy.volumes,
119+
shelves: hierarchy.shelves,
120+
};
156121
}

tests/hippocampus/Ingest.test.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,21 @@ describe("hippocampus ingest", () => {
6767
lastQueryAt: result.pages[0].createdAt,
6868
});
6969

70-
// Book should contain the pages
70+
// Book should contain some of the pages (hierarchy builder chunks by PAGES_PER_BOOK)
71+
expect(result.book).toBeDefined();
7172
const storedBook = await metadataStore.getBook(result.book!.bookId);
7273
expect(storedBook).toEqual(result.book);
7374

75+
// All pages should be covered by the books
76+
const allBookPageIds = result.books.flatMap((b) => b.pageIds);
77+
for (const page of result.pages) {
78+
expect(allBookPageIds).toContain(page.pageId);
79+
}
80+
81+
// Volumes and shelves should be produced
82+
expect(result.volumes.length).toBeGreaterThanOrEqual(1);
83+
expect(result.shelves.length).toBeGreaterThanOrEqual(1);
84+
7485
// Vector store should have data stored for each page
7586
expect(vectorStore.byteLength).toBeGreaterThan(0);
7687
});

tests/integration/IngestQuery.test.ts

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -234,10 +234,12 @@ describe("integration: ingest and query", () => {
234234
expect(stored!.embeddingDim).toBe(EMBEDDING_DIM);
235235
}
236236

237-
// Book should reference all page IDs
238-
const book = await metadataStore.getBook(result.book!.bookId);
239-
expect(book).toBeDefined();
240-
expect(book!.pageIds).toEqual(result.pages.map((p) => p.pageId));
237+
// Books should collectively reference all page IDs
238+
expect(result.books.length).toBeGreaterThanOrEqual(1);
239+
const allBookPageIds = result.books.flatMap((b) => b.pageIds);
240+
for (const page of result.pages) {
241+
expect(allBookPageIds).toContain(page.pageId);
242+
}
241243

242244
// Activity records should be initialized for each page
243245
for (const page of result.pages) {
@@ -278,7 +280,7 @@ describe("integration: ingest and query", () => {
278280
});
279281

280282
const ingestedPageIds = result.pages.map((p) => p.pageId);
281-
const bookId = result.book!.bookId;
283+
const bookIds = result.books.map((b) => b.bookId);
282284

283285
// ---- Session 2: Reopen the same database and verify persistence ----
284286

@@ -291,10 +293,11 @@ describe("integration: ingest and query", () => {
291293
expect(page!.pageId).toBe(pageId);
292294
}
293295

294-
// Book should still be there
295-
const book = await store2.getBook(bookId);
296-
expect(book).toBeDefined();
297-
expect(book!.pageIds).toEqual(ingestedPageIds);
296+
// Books should still be there
297+
for (const bookId of bookIds) {
298+
const book = await store2.getBook(bookId);
299+
expect(book).toBeDefined();
300+
}
298301

299302
// Activity records should survive
300303
for (const pageId of ingestedPageIds) {
@@ -398,7 +401,7 @@ describe("integration (v0.5): hierarchical and dialectical ingest/query", () =>
398401
(globalThis as Record<string, unknown>)["IDBKeyRange"] = FakeIDBKeyRange;
399402
});
400403

401-
it("ingest produces a single Book containing all ingested pages", async () => {
404+
it("ingest produces Books, Volumes, and Shelves via HierarchyBuilder", async () => {
402405
const dbName = freshDbName();
403406
const metadataStore = await IndexedDbMetadataStore.open(dbName);
404407
const vectorStore = new MemoryVectorStore();
@@ -417,21 +420,39 @@ describe("integration (v0.5): hierarchical and dialectical ingest/query", () =>
417420
// Pages were created
418421
expect(result.pages.length).toBeGreaterThanOrEqual(1);
419422

420-
// Exactly one Book was created and it contains ALL ingested pages
423+
// At least one Book was created
424+
expect(result.books.length).toBeGreaterThanOrEqual(1);
421425
expect(result.book).toBeDefined();
422-
const storedBook = await metadataStore.getBook(result.book!.bookId);
423-
expect(storedBook).toBeDefined();
424-
expect(storedBook!.medoidPageId).toBeDefined();
425-
expect(storedBook!.pageIds).toContain(storedBook!.medoidPageId);
426-
// Every page from the ingest must be a member of the book
426+
427+
// Every page must belong to exactly one book
428+
const allBookPageIds = result.books.flatMap((b) => b.pageIds);
427429
for (const page of result.pages) {
428-
expect(storedBook!.pageIds).toContain(page.pageId);
430+
expect(allBookPageIds).toContain(page.pageId);
431+
}
432+
// Every book's medoid must be one of its own pages
433+
for (const book of result.books) {
434+
const storedBook = await metadataStore.getBook(book.bookId);
435+
expect(storedBook).toBeDefined();
436+
expect(storedBook!.medoidPageId).toBeDefined();
437+
expect(storedBook!.pageIds).toContain(storedBook!.medoidPageId);
429438
}
430-
// The book covers all pages — not just a subset
431-
expect(storedBook!.pageIds.length).toBe(result.pages.length);
432439

433-
// Volumes and Shelves are assembled by the Daydreamer; not created at ingest time
434-
expect(result.book).toBeDefined(); // only book is returned
440+
// Volumes and Shelves are now produced during ingest via HierarchyBuilder
441+
expect(result.volumes.length).toBeGreaterThanOrEqual(1);
442+
expect(result.shelves.length).toBeGreaterThanOrEqual(1);
443+
444+
// Each volume should be persisted
445+
for (const vol of result.volumes) {
446+
const stored = await metadataStore.getVolume(vol.volumeId);
447+
expect(stored).toBeDefined();
448+
expect(stored!.bookIds.length).toBeGreaterThan(0);
449+
}
450+
// Each shelf should be persisted
451+
for (const shelf of result.shelves) {
452+
const stored = await metadataStore.getShelf(shelf.shelfId);
453+
expect(stored).toBeDefined();
454+
expect(stored!.volumeIds.length).toBeGreaterThan(0);
455+
}
435456
});
436457

437458
it("hotpath entries exist for hierarchy prototypes after ingest", async () => {

0 commit comments

Comments
 (0)