Skip to content

Commit e084396

Browse files
kapillamba4 and claude committed
Fix 6 code quality issues: dead code, missing IDs, perf, cleanup
- Remove dead `index_file()` function (replaced by `_parse_file_for_indexing` + `_store_parsed_file`)
- Add `symbol_id` to `hybrid_search` results so `find_definition` doesn't need ambiguous fallback lookup
- Cache file contents in `find_references()` to avoid re-reading the same file per reference
- Add LRU cache (128 entries) on `embed_text()` to skip redundant model calls for repeated queries
- Unify duplicate `SKIP_DIRS` into a single constant in `parser.py`, imported by `doc_parser.py`
- Add stale file cleanup pass in both `index_directory()` and `index_doc_directory()` to remove ghost entries for deleted files

Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent cbca379 commit e084396

4 files changed

Lines changed: 63 additions & 147 deletions

File tree

db.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
import sqlite3
1717
import sys
1818
from contextlib import contextmanager
19+
from functools import lru_cache
1920
from typing import TYPE_CHECKING
2021

2122
import sqlite_vec
@@ -150,19 +151,26 @@ def get_embedding_dim() -> int:
150151
return _embedding_dim
151152

152153

154+
@lru_cache(maxsize=128)
155+
def _embed_text_cached(text: str, task_type: str) -> tuple[float, ...]:
156+
"""Cached embedding computation. Returns tuple for hashability."""
157+
model = get_embedding_model()
158+
prefixed_text = f"{task_type}: {text}"
159+
vec = model.encode(prefixed_text, normalize_embeddings=True, show_progress_bar=False)
160+
return tuple(vec.tolist())
161+
162+
153163
def embed_text(text: str, task_type: str = "nl2code") -> list[float]:
154164
"""Generate a dense vector embedding for *text*.
155165
156166
Uses jina-code-embeddings with task prefix for better code retrieval.
167+
Results are cached (LRU, 128 entries) to avoid redundant model calls.
157168
158169
Args:
159170
text: The text to embed.
160171
task_type: One of 'nl2code', 'code2code', 'code2nl', 'code2completion', 'qa'.
161172
"""
162-
model = get_embedding_model()
163-
prefixed_text = f"{task_type}: {text}"
164-
vec = model.encode(prefixed_text, normalize_embeddings=True, show_progress_bar=False)
165-
return vec.tolist()
173+
return list(_embed_text_cached(text, task_type))
166174

167175

168176
def embed_texts_batch(

doc_parser.py

Lines changed: 18 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -7,32 +7,21 @@
77

88
from __future__ import annotations
99

10+
import logging
1011
import os
1112
import re
1213

1314
from markdown_it import MarkdownIt
1415

1516
import db as db_mod
16-
from parser import GitignoreMatcher
17+
from parser import SKIP_DIRS, GitignoreMatcher
18+
19+
logger = logging.getLogger(__name__)
1720

1821
# ---------------------------------------------------------------------------
1922
# Configuration
2023
# ---------------------------------------------------------------------------
2124

22-
SKIP_DIRS = {
23-
".venv",
24-
"venv",
25-
"__pycache__",
26-
".git",
27-
"node_modules",
28-
"build",
29-
"dist",
30-
"target",
31-
".mypy_cache",
32-
".pytest_cache",
33-
".ruff_cache",
34-
}
35-
3625
DOC_EXTENSIONS = {".md", ".markdown"}
3726
README_PATTERN = re.compile(r"^readme(\.md|\.markdown|\.txt)?$", re.IGNORECASE)
3827

@@ -393,6 +382,20 @@ def index_doc_directory(dirpath: str, db, progress_callback=None, progress_offse
393382
current = progress_offset + i + 1
394383
progress_callback(current, progress_total, f"Indexing docs: {os.path.basename(filepath)}")
395384

385+
# Clean up stale doc files (deleted from disk but still in index)
386+
stale_count = 0
387+
rows = db.execute("SELECT id, path FROM doc_files").fetchall()
388+
for doc_file_id, path in rows:
389+
if not path.startswith(abs_dir + os.sep) and path != abs_dir:
390+
continue
391+
if not os.path.exists(path):
392+
db_mod.delete_doc_file_data(db, doc_file_id)
393+
db.execute("DELETE FROM doc_files WHERE id = ?", (doc_file_id,))
394+
db.commit()
395+
stale_count += 1
396+
if stale_count:
397+
logger.info("Cleaned up %d stale doc file(s) no longer on disk", stale_count)
398+
396399
return results
397400

398401

parser.py

Lines changed: 19 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@
2626
MAX_WORKERS = int(os.environ.get("CODE_MEMORY_MAX_WORKERS", "4"))
2727

2828
# ── Directories to always skip (even without .gitignore) ───────────────
29-
_SKIP_DIRS = frozenset({
29+
SKIP_DIRS = frozenset({
3030
".venv", "venv", "__pycache__", ".git", "node_modules",
3131
".mypy_cache", ".pytest_cache", ".ruff_cache", ".tox",
3232
"dist", "build", "target", "bin", "obj",
@@ -330,122 +330,6 @@ def _walk(node: Node):
330330
return refs
331331

332332

333-
# ---------------------------------------------------------------------------
334-
# Single-file indexer
335-
# ---------------------------------------------------------------------------
336-
337-
def index_file(filepath: str, db) -> dict:
338-
"""Parse a single source file and index its symbols + references.
339-
340-
Optimized version using batch embeddings and transaction-based writes.
341-
342-
Uses tree-sitter when a grammar is available for the file's language.
343-
Falls back to indexing the whole file as a single symbol otherwise.
344-
Skips the file if its ``last_modified`` timestamp has not changed.
345-
346-
Args:
347-
filepath: Absolute path to a source file.
348-
db: An open ``sqlite3.Connection`` from ``db.get_db()``.
349-
350-
Returns:
351-
A dict with ``file``, ``symbols_indexed``, ``references_indexed``,
352-
and ``skipped`` keys.
353-
"""
354-
filepath = os.path.abspath(filepath)
355-
ext = os.path.splitext(filepath)[1].lower()
356-
357-
# ── Check freshness ───────────────────────────────────────────────
358-
mtime = os.path.getmtime(filepath)
359-
row = db.execute(
360-
"SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
361-
).fetchone()
362-
363-
if row and row[1] >= mtime:
364-
return {"file": filepath, "symbols_indexed": 0,
365-
"references_indexed": 0, "skipped": True}
366-
367-
# ── Read file ─────────────────────────────────────────────────────
368-
source_bytes = Path(filepath).read_bytes()
369-
source_text = source_bytes.decode("utf-8", errors="replace")
370-
371-
fhash = db_mod.file_hash(filepath) # Now uses xxHash
372-
file_id = db_mod.upsert_file(db, filepath, mtime, fhash)
373-
374-
# Delete stale data before re-inserting
375-
db_mod.delete_file_data(db, file_id)
376-
377-
symbols_indexed = 0
378-
references_indexed = 0
379-
380-
# ── Try tree-sitter parsing ───────────────────────────────────────
381-
lang = _load_language(ext)
382-
383-
if lang is not None:
384-
parser = Parser(lang)
385-
tree = parser.parse(source_bytes)
386-
387-
# Extract symbols
388-
raw_symbols = _extract_symbols(tree.root_node, source_bytes)
389-
390-
# === BATCH PROCESSING ===
391-
all_embed_inputs = []
392-
for sym in raw_symbols:
393-
embed_input = f"{sym['kind']} {sym['name']}: {sym['source_text'][:1000]}"
394-
all_embed_inputs.append(embed_input)
395-
396-
# Batch embed all at once
397-
# Use code2code task_type for code content at index time.
398-
# Query time uses nl2code (natural language -> code), so index time
399-
# should use code2code (code -> code) to place vectors in the correct subspace.
400-
if all_embed_inputs:
401-
embeddings = db_mod.embed_texts_batch(all_embed_inputs, batch_size=64, task_type="code2code")
402-
403-
# Store all in single transaction
404-
db_ids = {}
405-
with db_mod.transaction(db):
406-
for i, sym in enumerate(raw_symbols):
407-
parent_id = db_ids.get(sym["parent_idx"]) if sym["parent_idx"] is not None else None
408-
sym_id = db_mod.upsert_symbol(
409-
db, sym["name"], sym["kind"], file_id,
410-
sym["line_start"], sym["line_end"],
411-
parent_id, sym["source_text"],
412-
auto_commit=False
413-
)
414-
db_ids[i] = sym_id
415-
db_mod.upsert_embedding(db, sym_id, embeddings[i], auto_commit=False)
416-
symbols_indexed += 1
417-
418-
# Extract and store references (also batched)
419-
refs = _extract_references(tree.root_node, source_bytes)
420-
if refs:
421-
with db_mod.transaction(db):
422-
for ref in refs:
423-
db_mod.upsert_reference(db, ref["name"], file_id, ref["line"], auto_commit=False)
424-
references_indexed += 1
425-
426-
else:
427-
# ── Fallback: index entire file as one symbol ─────────────────
428-
basename = os.path.basename(filepath)
429-
embeddings = db_mod.embed_texts_batch([f"file {basename}: {source_text[:1000]}"], task_type="code2code")
430-
431-
with db_mod.transaction(db):
432-
sym_id = db_mod.upsert_symbol(
433-
db, basename, "file", file_id,
434-
1, source_text.count("\n") + 1,
435-
None, source_text[:5000],
436-
auto_commit=False
437-
)
438-
db_mod.upsert_embedding(db, sym_id, embeddings[0], auto_commit=False)
439-
symbols_indexed += 1
440-
441-
return {
442-
"file": filepath,
443-
"symbols_indexed": symbols_indexed,
444-
"references_indexed": references_indexed,
445-
"skipped": False,
446-
}
447-
448-
449333
# ---------------------------------------------------------------------------
450334
# Directory indexer
451335
# ---------------------------------------------------------------------------
@@ -457,7 +341,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
457341
embedding generation sequential (sentence-transformers releases GIL during
458342
inference). Processes files in batches for embedding efficiency.
459343
460-
Skips directories in ``_SKIP_DIRS``, files matching ``.gitignore`` patterns
344+
Skips directories in ``SKIP_DIRS``, files matching ``.gitignore`` patterns
461345
(including nested .gitignore files), and unchanged files. Indexes any file
462346
with a recognised source-code extension.
463347
@@ -467,7 +351,8 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
467351
progress_callback: Optional callback(current, total, message) for progress updates.
468352
469353
Returns:
470-
A list of per-file result dicts (see :func:`index_file`).
354+
A list of per-file result dicts (see :func:`_parse_file_for_indexing`
355+
and :func:`_store_parsed_file`).
471356
"""
472357
import time
473358

@@ -486,7 +371,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
486371
rel_root = os.path.relpath(root, dirpath)
487372
if rel_root != ".":
488373
gitignore.check_dir_for_gitignore(root, rel_root)
489-
dirs[:] = [d for d in dirs if d not in _SKIP_DIRS and not d.endswith(".egg-info")
374+
dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.endswith(".egg-info")
490375
and not gitignore.should_skip(os.path.join(rel_root, d) if rel_root != "." else d, is_dir=True)]
491376
for fname in sorted(files):
492377
rel_path = os.path.join(rel_root, fname) if rel_root != "." else fname
@@ -593,6 +478,20 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
593478
file_result = _store_parsed_file(fpath, parsed_data, db, file_embeddings)
594479
results.append(file_result)
595480

481+
# Phase 4: Clean up stale files (deleted from disk but still in index)
482+
stale_count = 0
483+
rows = db.execute("SELECT id, path FROM files").fetchall()
484+
for file_id, path in rows:
485+
if not path.startswith(dirpath + os.sep) and path != dirpath:
486+
continue
487+
if not os.path.exists(path):
488+
db_mod.delete_file_data(db, file_id)
489+
db.execute("DELETE FROM files WHERE id = ?", (file_id,))
490+
db.commit()
491+
stale_count += 1
492+
if stale_count:
493+
logger.info("Cleaned up %d stale file(s) no longer on disk", stale_count)
494+
596495
# Log performance summary
597496
total_elapsed = time.perf_counter() - total_start
598497
total_symbols = sum(r.get("symbols_indexed", 0) for r in results)

queries.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -190,6 +190,7 @@ def hybrid_search(query: str, db, top_k: int = 10, rerank: bool = True) -> list[
190190
confidence = round(normalized_score / 100.0, 3)
191191

192192
result = {
193+
"symbol_id": sid,
193194
**details[sid],
194195
"score": round(normalized_score, 1),
195196
"match_reason": match_reason,
@@ -400,6 +401,7 @@ def find_references(symbol_name: str, db, include_context: bool = True) -> list[
400401

401402
# Enrich with context
402403
enriched = []
404+
file_cache: dict[str, list[str] | None] = {}
403405
for r in rows:
404406
ref = {
405407
"symbol_name": r[0],
@@ -409,14 +411,18 @@ def find_references(symbol_name: str, db, include_context: bool = True) -> list[
409411
"containing_symbol": None,
410412
}
411413

412-
# Get the source line at this reference
413-
try:
414-
with open(r[1]) as f:
415-
lines = f.readlines()
416-
if 0 < r[2] <= len(lines):
417-
ref["source_line"] = lines[r[2] - 1].strip()
418-
except Exception:
419-
pass
414+
# Get the source line at this reference (cached per file)
415+
file_path = r[1]
416+
if file_path not in file_cache:
417+
try:
418+
with open(file_path) as f:
419+
file_cache[file_path] = f.readlines()
420+
except Exception:
421+
file_cache[file_path] = None
422+
423+
cached_lines = file_cache[file_path]
424+
if cached_lines and 0 < r[2] <= len(cached_lines):
425+
ref["source_line"] = cached_lines[r[2] - 1].strip()
420426

421427
# Find containing symbol
422428
containing = db.execute(

0 commit comments

Comments
 (0)