Fix 6 code quality issues: dead code, missing IDs, perf, cleanup

kapillamba4 · claude · kapillamba4 · commit e0843963d3f8 · 2026-03-13T20:56:32.000+05:30
- Remove dead `index_file()` function (replaced by `_parse_file_for_indexing` + `_store_parsed_file`)
- Add `symbol_id` to `hybrid_search` results so `find_definition` doesn't need ambiguous fallback lookup
- Cache file contents in `find_references()` to avoid re-reading the same file per reference
- Add LRU cache (128 entries) on `embed_text()` to skip redundant model calls for repeated queries
- Unify duplicate `SKIP_DIRS` into a single constant in `parser.py`, imported by `doc_parser.py`
- Add stale file cleanup pass in both `index_directory()` and `index_doc_directory()` to remove ghost entries for deleted files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/db.py b/db.py
@@ -16,6 +16,7 @@
 import sqlite3
 import sys
 from contextlib import contextmanager
+from functools import lru_cache
 from typing import TYPE_CHECKING
 
 import sqlite_vec
@@ -150,19 +151,26 @@ def get_embedding_dim() -> int:
     return _embedding_dim
 
 
+@lru_cache(maxsize=128)
+def _embed_text_cached(text: str, task_type: str) -> tuple[float, ...]:
+    """Cached embedding computation. Returns tuple for hashability."""
+    model = get_embedding_model()
+    prefixed_text = f"{task_type}: {text}"
+    vec = model.encode(prefixed_text, normalize_embeddings=True, show_progress_bar=False)
+    return tuple(vec.tolist())
+
+
 def embed_text(text: str, task_type: str = "nl2code") -> list[float]:
     """Generate a dense vector embedding for *text*.
 
     Uses jina-code-embeddings with task prefix for better code retrieval.
+    Results are cached (LRU, 128 entries) to avoid redundant model calls.
 
     Args:
         text: The text to embed.
         task_type: One of 'nl2code', 'code2code', 'code2nl', 'code2completion', 'qa'.
     """
-    model = get_embedding_model()
-    prefixed_text = f"{task_type}: {text}"
-    vec = model.encode(prefixed_text, normalize_embeddings=True, show_progress_bar=False)
-    return vec.tolist()
+    return list(_embed_text_cached(text, task_type))
 
 
 def embed_texts_batch(
diff --git a/doc_parser.py b/doc_parser.py
@@ -7,32 +7,21 @@
 
 from __future__ import annotations
 
+import logging
 import os
 import re
 
 from markdown_it import MarkdownIt
 
 import db as db_mod
-from parser import GitignoreMatcher
+from parser import SKIP_DIRS, GitignoreMatcher
+
+logger = logging.getLogger(__name__)
 
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 
-SKIP_DIRS = {
-    ".venv",
-    "venv",
-    "__pycache__",
-    ".git",
-    "node_modules",
-    "build",
-    "dist",
-    "target",
-    ".mypy_cache",
-    ".pytest_cache",
-    ".ruff_cache",
-}
-
 DOC_EXTENSIONS = {".md", ".markdown"}
 README_PATTERN = re.compile(r"^readme(\.md|\.markdown|\.txt)?$", re.IGNORECASE)
 
@@ -393,6 +382,20 @@ def index_doc_directory(dirpath: str, db, progress_callback=None, progress_offse
             current = progress_offset + i + 1
             progress_callback(current, progress_total, f"Indexing docs: {os.path.basename(filepath)}")
 
+    # Clean up stale doc files (deleted from disk but still in index)
+    stale_count = 0
+    rows = db.execute("SELECT id, path FROM doc_files").fetchall()
+    for doc_file_id, path in rows:
+        if not path.startswith(abs_dir + os.sep) and path != abs_dir:
+            continue
+        if not os.path.exists(path):
+            db_mod.delete_doc_file_data(db, doc_file_id)
+            db.execute("DELETE FROM doc_files WHERE id = ?", (doc_file_id,))
+            db.commit()
+            stale_count += 1
+    if stale_count:
+        logger.info("Cleaned up %d stale doc file(s) no longer on disk", stale_count)
+
     return results
 
 
diff --git a/parser.py b/parser.py
@@ -26,7 +26,7 @@
 MAX_WORKERS = int(os.environ.get("CODE_MEMORY_MAX_WORKERS", "4"))
 
 # ── Directories to always skip (even without .gitignore) ───────────────
-_SKIP_DIRS = frozenset({
+SKIP_DIRS = frozenset({
     ".venv", "venv", "__pycache__", ".git", "node_modules",
     ".mypy_cache", ".pytest_cache", ".ruff_cache", ".tox",
     "dist", "build", "target", "bin", "obj",
@@ -330,122 +330,6 @@ def _walk(node: Node):
     return refs
 
 
-# ---------------------------------------------------------------------------
-# Single-file indexer
-# ---------------------------------------------------------------------------
-
-def index_file(filepath: str, db) -> dict:
-    """Parse a single source file and index its symbols + references.
-
-    Optimized version using batch embeddings and transaction-based writes.
-
-    Uses tree-sitter when a grammar is available for the file's language.
-    Falls back to indexing the whole file as a single symbol otherwise.
-    Skips the file if its ``last_modified`` timestamp has not changed.
-
-    Args:
-        filepath: Absolute path to a source file.
-        db: An open ``sqlite3.Connection`` from ``db.get_db()``.
-
-    Returns:
-        A dict with ``file``, ``symbols_indexed``, ``references_indexed``,
-        and ``skipped`` keys.
-    """
-    filepath = os.path.abspath(filepath)
-    ext = os.path.splitext(filepath)[1].lower()
-
-    # ── Check freshness ───────────────────────────────────────────────
-    mtime = os.path.getmtime(filepath)
-    row = db.execute(
-        "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
-    ).fetchone()
-
-    if row and row[1] >= mtime:
-        return {"file": filepath, "symbols_indexed": 0,
-                "references_indexed": 0, "skipped": True}
-
-    # ── Read file ─────────────────────────────────────────────────────
-    source_bytes = Path(filepath).read_bytes()
-    source_text = source_bytes.decode("utf-8", errors="replace")
-
-    fhash = db_mod.file_hash(filepath)  # Now uses xxHash
-    file_id = db_mod.upsert_file(db, filepath, mtime, fhash)
-
-    # Delete stale data before re-inserting
-    db_mod.delete_file_data(db, file_id)
-
-    symbols_indexed = 0
-    references_indexed = 0
-
-    # ── Try tree-sitter parsing ───────────────────────────────────────
-    lang = _load_language(ext)
-
-    if lang is not None:
-        parser = Parser(lang)
-        tree = parser.parse(source_bytes)
-
-        # Extract symbols
-        raw_symbols = _extract_symbols(tree.root_node, source_bytes)
-
-        # === BATCH PROCESSING ===
-        all_embed_inputs = []
-        for sym in raw_symbols:
-            embed_input = f"{sym['kind']} {sym['name']}: {sym['source_text'][:1000]}"
-            all_embed_inputs.append(embed_input)
-
-        # Batch embed all at once
-        # Use code2code task_type for code content at index time.
-        # Query time uses nl2code (natural language -> code), so index time
-        # should use code2code (code -> code) to place vectors in the correct subspace.
-        if all_embed_inputs:
-            embeddings = db_mod.embed_texts_batch(all_embed_inputs, batch_size=64, task_type="code2code")
-
-            # Store all in single transaction
-            db_ids = {}
-            with db_mod.transaction(db):
-                for i, sym in enumerate(raw_symbols):
-                    parent_id = db_ids.get(sym["parent_idx"]) if sym["parent_idx"] is not None else None
-                    sym_id = db_mod.upsert_symbol(
-                        db, sym["name"], sym["kind"], file_id,
-                        sym["line_start"], sym["line_end"],
-                        parent_id, sym["source_text"],
-                        auto_commit=False
-                    )
-                    db_ids[i] = sym_id
-                    db_mod.upsert_embedding(db, sym_id, embeddings[i], auto_commit=False)
-                    symbols_indexed += 1
-
-        # Extract and store references (also batched)
-        refs = _extract_references(tree.root_node, source_bytes)
-        if refs:
-            with db_mod.transaction(db):
-                for ref in refs:
-                    db_mod.upsert_reference(db, ref["name"], file_id, ref["line"], auto_commit=False)
-                    references_indexed += 1
-
-    else:
-        # ── Fallback: index entire file as one symbol ─────────────────
-        basename = os.path.basename(filepath)
-        embeddings = db_mod.embed_texts_batch([f"file {basename}: {source_text[:1000]}"], task_type="code2code")
-
-        with db_mod.transaction(db):
-            sym_id = db_mod.upsert_symbol(
-                db, basename, "file", file_id,
-                1, source_text.count("\n") + 1,
-                None, source_text[:5000],
-                auto_commit=False
-            )
-            db_mod.upsert_embedding(db, sym_id, embeddings[0], auto_commit=False)
-            symbols_indexed += 1
-
-    return {
-        "file": filepath,
-        "symbols_indexed": symbols_indexed,
-        "references_indexed": references_indexed,
-        "skipped": False,
-    }
-
-
 # ---------------------------------------------------------------------------
 # Directory indexer
 # ---------------------------------------------------------------------------
@@ -457,7 +341,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
     embedding generation sequential (sentence-transformers releases GIL during
     inference). Processes files in batches for embedding efficiency.
 
-    Skips directories in ``_SKIP_DIRS``, files matching ``.gitignore`` patterns
+    Skips directories in ``SKIP_DIRS``, files matching ``.gitignore`` patterns
     (including nested .gitignore files), and unchanged files.  Indexes any file
     with a recognised source-code extension.
 
@@ -467,7 +351,8 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
         progress_callback: Optional callback(current, total, message) for progress updates.
 
     Returns:
-        A list of per-file result dicts (see :func:`index_file`).
+        A list of per-file result dicts (see :func:`_parse_file_for_indexing`
+        and :func:`_store_parsed_file`).
     """
     import time
 
@@ -486,7 +371,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
         rel_root = os.path.relpath(root, dirpath)
         if rel_root != ".":
             gitignore.check_dir_for_gitignore(root, rel_root)
-        dirs[:] = [d for d in dirs if d not in _SKIP_DIRS and not d.endswith(".egg-info")
+        dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.endswith(".egg-info")
                    and not gitignore.should_skip(os.path.join(rel_root, d) if rel_root != "." else d, is_dir=True)]
         for fname in sorted(files):
             rel_path = os.path.join(rel_root, fname) if rel_root != "." else fname
@@ -593,6 +478,20 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
         file_result = _store_parsed_file(fpath, parsed_data, db, file_embeddings)
         results.append(file_result)
 
+    # Phase 4: Clean up stale files (deleted from disk but still in index)
+    stale_count = 0
+    rows = db.execute("SELECT id, path FROM files").fetchall()
+    for file_id, path in rows:
+        if not path.startswith(dirpath + os.sep) and path != dirpath:
+            continue
+        if not os.path.exists(path):
+            db_mod.delete_file_data(db, file_id)
+            db.execute("DELETE FROM files WHERE id = ?", (file_id,))
+            db.commit()
+            stale_count += 1
+    if stale_count:
+        logger.info("Cleaned up %d stale file(s) no longer on disk", stale_count)
+
     # Log performance summary
     total_elapsed = time.perf_counter() - total_start
     total_symbols = sum(r.get("symbols_indexed", 0) for r in results)
diff --git a/queries.py b/queries.py
@@ -190,6 +190,7 @@ def hybrid_search(query: str, db, top_k: int = 10, rerank: bool = True) -> list[
         confidence = round(normalized_score / 100.0, 3)
 
         result = {
+            "symbol_id": sid,
             **details[sid],
             "score": round(normalized_score, 1),
             "match_reason": match_reason,
@@ -400,6 +401,7 @@ def find_references(symbol_name: str, db, include_context: bool = True) -> list[
 
     # Enrich with context
     enriched = []
+    file_cache: dict[str, list[str] | None] = {}
     for r in rows:
         ref = {
             "symbol_name": r[0],
@@ -409,14 +411,18 @@ def find_references(symbol_name: str, db, include_context: bool = True) -> list[
             "containing_symbol": None,
         }
 
-        # Get the source line at this reference
-        try:
-            with open(r[1]) as f:
-                lines = f.readlines()
-                if 0 < r[2] <= len(lines):
-                    ref["source_line"] = lines[r[2] - 1].strip()
-        except Exception:
-            pass
+        # Get the source line at this reference (cached per file)
+        file_path = r[1]
+        if file_path not in file_cache:
+            try:
+                with open(file_path) as f:
+                    file_cache[file_path] = f.readlines()
+            except Exception:
+                file_cache[file_path] = None
+
+        cached_lines = file_cache[file_path]
+        if cached_lines and 0 < r[2] <= len(cached_lines):
+            ref["source_line"] = cached_lines[r[2] - 1].strip()
 
         # Find containing symbol
         containing = db.execute(