def _parse_worker_count(raw, default=4):
    """Turn the raw ``CODE_MEMORY_MAX_WORKERS`` value into a usable count.

    Args:
        raw: The environment variable's string value, or ``None`` when the
            variable is unset.
        default: Count returned when ``raw`` is unset or unusable.

    Returns:
        ``int(raw)`` when that is a whole number >= 1, otherwise ``default``.
    """
    if raw is None:
        return default
    try:
        count = int(raw)
    except ValueError:
        # Non-numeric (or empty) value: treat like any other unusable count
        # instead of aborting the module import with a bare ValueError.
        count = 0
    if count < 1:
        import warnings

        warnings.warn(
            f"Ignoring invalid CODE_MEMORY_MAX_WORKERS={raw!r}; using {default}",
            RuntimeWarning,
            stacklevel=2,
        )
        return default
    return count


# Maximum worker count; override with the CODE_MEMORY_MAX_WORKERS environment
# variable (default 4).
MAX_WORKERS = _parse_worker_count(os.environ.get("CODE_MEMORY_MAX_WORKERS"))
2727
2828# ── Directories to always skip (even without .gitignore) ───────────────
29- _SKIP_DIRS = frozenset ({
29+ SKIP_DIRS = frozenset ({
3030 ".venv" , "venv" , "__pycache__" , ".git" , "node_modules" ,
3131 ".mypy_cache" , ".pytest_cache" , ".ruff_cache" , ".tox" ,
3232 "dist" , "build" , "target" , "bin" , "obj" ,
@@ -330,122 +330,6 @@ def _walk(node: Node):
330330 return refs
331331
332332
333- # ---------------------------------------------------------------------------
334- # Single-file indexer
335- # ---------------------------------------------------------------------------
336-
def index_file(filepath: str, db) -> dict:
    """Parse a single source file and index its symbols and references.

    Uses tree-sitter when ``_load_language`` returns a grammar for the file's
    extension; otherwise the whole file is indexed as a single ``"file"``
    symbol. The file is skipped when the ``last_modified`` value stored for
    its path is not older than the file's current mtime.

    All parsing and embedding is done *before* the database is touched, so a
    failure in either step leaves the previously indexed data (and the stored
    ``last_modified``) intact and the file is retried on the next run.
    Symbols, embeddings and references are then written in one transaction.

    Args:
        filepath: Path to a source file (made absolute internally).
        db: An open ``sqlite3.Connection`` from ``db.get_db()``.

    Returns:
        A dict with ``file``, ``symbols_indexed``, ``references_indexed``,
        and ``skipped`` keys.
    """
    filepath = os.path.abspath(filepath)
    ext = os.path.splitext(filepath)[1].lower()

    # ── Check freshness ───────────────────────────────────────────────
    mtime = os.path.getmtime(filepath)
    row = db.execute(
        "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
    ).fetchone()

    if row and row[1] >= mtime:
        return {"file": filepath, "symbols_indexed": 0,
                "references_indexed": 0, "skipped": True}

    # ── Read file ─────────────────────────────────────────────────────
    source_bytes = Path(filepath).read_bytes()
    source_text = source_bytes.decode("utf-8", errors="replace")
    fhash = db_mod.file_hash(filepath)

    # ── Parse + embed (no database writes yet) ────────────────────────
    lang = _load_language(ext)

    if lang is not None:
        tree = Parser(lang).parse(source_bytes)
        symbols = _extract_symbols(tree.root_node, source_bytes)
        refs = _extract_references(tree.root_node, source_bytes)

        embed_inputs = [
            f"{sym['kind']} {sym['name']} : {sym['source_text'][:1000]} "
            for sym in symbols
        ]
        # Index time uses the code2code task type; query time uses nl2code
        # (natural language -> code), per the original design note.
        embeddings = (
            db_mod.embed_texts_batch(
                embed_inputs, batch_size=64, task_type="code2code"
            )
            if embed_inputs
            else []
        )
    else:
        # Fallback: the entire file becomes one "file" symbol.
        basename = os.path.basename(filepath)
        symbols = [{
            "name": basename,
            "kind": "file",
            "line_start": 1,
            # Count newlines (not newline-plus-space) to get the last line.
            "line_end": source_text.count("\n") + 1,
            "parent_idx": None,
            "source_text": source_text[:5000],
        }]
        refs = []
        embeddings = db_mod.embed_texts_batch(
            [f"file {basename} : {source_text[:1000]} "], task_type="code2code"
        )

    # ── Write phase ───────────────────────────────────────────────────
    # NOTE(review): upsert_file / delete_file_data are called without
    # auto_commit=False, as in the original; presumably they commit on their
    # own, so they stay outside the transaction below — confirm in db_mod.
    file_id = db_mod.upsert_file(db, filepath, mtime, fhash)
    db_mod.delete_file_data(db, file_id)  # drop stale rows before re-inserting

    symbols_indexed = 0
    references_indexed = 0
    row_ids = {}  # symbol position in `symbols` -> database id, for parents

    with db_mod.transaction(db):
        for i, sym in enumerate(symbols):
            # A parent_idx of None is never a key, so .get() yields None.
            parent_id = row_ids.get(sym["parent_idx"])
            sym_id = db_mod.upsert_symbol(
                db, sym["name"], sym["kind"], file_id,
                sym["line_start"], sym["line_end"],
                parent_id, sym["source_text"],
                auto_commit=False,
            )
            row_ids[i] = sym_id
            db_mod.upsert_embedding(db, sym_id, embeddings[i], auto_commit=False)
            symbols_indexed += 1

        for ref in refs:
            db_mod.upsert_reference(
                db, ref["name"], file_id, ref["line"], auto_commit=False
            )
            references_indexed += 1

    return {
        "file": filepath,
        "symbols_indexed": symbols_indexed,
        "references_indexed": references_indexed,
        "skipped": False,
    }
447-
448-
449333# ---------------------------------------------------------------------------
450334# Directory indexer
451335# ---------------------------------------------------------------------------
@@ -457,7 +341,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
457341 embedding generation sequential (sentence-transformers releases GIL during
458342 inference). Processes files in batches for embedding efficiency.
459343
460- Skips directories in ``_SKIP_DIRS ``, files matching ``.gitignore`` patterns
344+ Skips directories in ``SKIP_DIRS ``, files matching ``.gitignore`` patterns
461345 (including nested .gitignore files), and unchanged files. Indexes any file
462346 with a recognised source-code extension.
463347
@@ -467,7 +351,8 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
467351 progress_callback: Optional callback(current, total, message) for progress updates.
468352
469353 Returns:
470- A list of per-file result dicts (see :func:`index_file`).
354+ A list of per-file result dicts (see :func:`_parse_file_for_indexing`
355+ and :func:`_store_parsed_file`).
471356 """
472357 import time
473358
@@ -486,7 +371,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
486371 rel_root = os .path .relpath (root , dirpath )
487372 if rel_root != "." :
488373 gitignore .check_dir_for_gitignore (root , rel_root )
489- dirs [:] = [d for d in dirs if d not in _SKIP_DIRS and not d .endswith (".egg-info" )
374+ dirs [:] = [d for d in dirs if d not in SKIP_DIRS and not d .endswith (".egg-info" )
490375 and not gitignore .should_skip (os .path .join (rel_root , d ) if rel_root != "." else d , is_dir = True )]
491376 for fname in sorted (files ):
492377 rel_path = os .path .join (rel_root , fname ) if rel_root != "." else fname
@@ -593,6 +478,20 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
593478 file_result = _store_parsed_file (fpath , parsed_data , db , file_embeddings )
594479 results .append (file_result )
595480
481+ # Phase 4: Clean up stale files (deleted from disk but still in index)
482+ stale_count = 0
483+ rows = db .execute ("SELECT id, path FROM files" ).fetchall ()
484+ for file_id , path in rows :
485+ if not path .startswith (dirpath + os .sep ) and path != dirpath :
486+ continue
487+ if not os .path .exists (path ):
488+ db_mod .delete_file_data (db , file_id )
489+ db .execute ("DELETE FROM files WHERE id = ?" , (file_id ,))
490+ db .commit ()
491+ stale_count += 1
492+ if stale_count :
493+ logger .info ("Cleaned up %d stale file(s) no longer on disk" , stale_count )
494+
596495 # Log performance summary
597496 total_elapsed = time .perf_counter () - total_start
598497 total_symbols = sum (r .get ("symbols_indexed" , 0 ) for r in results )
0 commit comments