diff --git a/java_index_flow_lancedb.py b/java_index_flow_lancedb.py
index 3308118..ea2ab2f 100644
--- a/java_index_flow_lancedb.py
+++ b/java_index_flow_lancedb.py
@@ -60,16 +60,21 @@
     PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root")
     LANCE_DB = coco.ContextKey("java_lance_async_conn")
     EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder]("java_lance_embedder")
+    IGNORE = coco.ContextKey[LayeredIgnore]("java_lance_layered_ignore")
 elif "tracked" in _ck_params:
     PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root", tracked=False)
     LANCE_DB = coco.ContextKey("java_lance_async_conn", tracked=False)
     EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder](
         "java_lance_embedder", tracked=False
     )
+    IGNORE = coco.ContextKey[LayeredIgnore](
+        "java_lance_layered_ignore", tracked=False
+    )
 else:
     PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root")
     LANCE_DB = coco.ContextKey("java_lance_async_conn")
     EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder]("java_lance_embedder")
+    IGNORE = coco.ContextKey[LayeredIgnore]("java_lance_layered_ignore")
 
 splitter = RecursiveSplitter()
 
@@ -292,6 +297,7 @@ async def coco_lifespan(builder: coco.EnvironmentBuilder) -> AsyncIterator[None]
         trust_remote_code=True,
     )
     builder.provide(EMBEDDER, embedder)
+    builder.provide(IGNORE, LayeredIgnore(root))
 
     uri = str(index_dir)
 
@@ -348,7 +354,8 @@ async def process_java_file(
 ) -> None:
     embedder = coco.use_context(EMBEDDER)
     project_root = coco.use_context(PROJECT_ROOT)
-    if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()):
+    ignore = coco.use_context(IGNORE)
+    if ignore.is_ignored((project_root / file.file_path.path).resolve()):
         return
     try:
         content = await file.read_text()
@@ -420,7 +427,8 @@ async def process_sql_file(
 ) -> None:
     embedder = coco.use_context(EMBEDDER)
     project_root = coco.use_context(PROJECT_ROOT)
-    if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()):
+    ignore = coco.use_context(IGNORE)
+    if ignore.is_ignored((project_root / file.file_path.path).resolve()):
         return
     try:
         content = await file.read_text()
@@ -468,7 +476,8 @@ async def process_yaml_file(
 ) -> None:
     embedder = coco.use_context(EMBEDDER)
     project_root = coco.use_context(PROJECT_ROOT)
-    if LayeredIgnore(project_root).is_ignored((project_root / file.file_path.path).resolve()):
+    ignore = coco.use_context(IGNORE)
+    if ignore.is_ignored((project_root / file.file_path.path).resolve()):
         return
     try:
         content = await file.read_text()
diff --git a/path_filtering.py b/path_filtering.py
index 4ff36db..57dadf2 100644
--- a/path_filtering.py
+++ b/path_filtering.py
@@ -300,6 +300,7 @@ def __init__(
             _scan_negation_any_bundle_ignore(self.project_root)
             or (use_gitignore and _scan_negation_any_gitignore(self.project_root))
         )
+        self._mega_cache: dict[str, tuple[list[str], GitIgnoreSpec, list[tuple[str, Path | None, int, str]]]] = {}
 
     def cocoindex_excluded_patterns(self) -> list[str]:
         """Patterns for CocoIndex ``PatternFilePathMatcher.excluded_patterns``.
@@ -332,6 +333,11 @@ def _path_for_display(self, path: Path | None) -> str:
         return path.as_posix()
 
     def _mega(self, rel_project: str) -> tuple[list[str], GitIgnoreSpec, list[tuple[str, Path | None, int, str]]]:
+        # Cache by directory (parent of rel_project). _mega_build_for_rel reads only dir_parts,
+        # so files in the same directory share the same mega/spec/meta tuple.
+        cache_key = Path(rel_project).parent.as_posix()
+        if cache_key in self._mega_cache:
+            return self._mega_cache[cache_key]
         mega, meta = _mega_build_for_rel(
             self.project_root,
             rel_project,
@@ -340,7 +346,9 @@ def _mega(self, rel_project: str) -> tuple[list[str], GitIgnoreSpec, list[tuple[
             project_ignore_path=self._project_ignore_path,
             project_lines=self._project_lines,
         )
-        return mega, GitIgnoreSpec.from_lines(mega), meta
+        result = (mega, GitIgnoreSpec.from_lines(mega), meta)
+        self._mega_cache[cache_key] = result
+        return result
 
     def is_ignored(self, path: Path) -> bool:
         """Return whether ``path`` is ignored by any configured layer.
diff --git a/tests/test_lancedb_e2e.py b/tests/test_lancedb_e2e.py
index 5ba0d9b..0e1d8c5 100644
--- a/tests/test_lancedb_e2e.py
+++ b/tests/test_lancedb_e2e.py
@@ -345,3 +345,44 @@ async def test_search_returns_multiple_hits(lance_index: Path, monkeypatch) -> N
     )
     assert out["success"] is True
     assert len(out["results"]) >= 1
+
+
+def test_layered_ignore_provided_once_per_flow() -> None:
+    """Source-structure assertion that IGNORE is provided once and consumed three times.
+
+    This test verifies the wiring invariant (IGNORE ContextKey provided once in
+    coco_lifespan, consumed in three process_*_file sites) by inspecting the flow
+    module source code. The behavioral guarantee (a single LayeredIgnore instance
+    per flow run) is backed by the HEAVY e2e test below and the sentinel grep.
+
+    This approach is used because in-process testing of coco_lifespan would require
+    stubbing the embedder/LanceDB setup, and subprocess-based testing cannot cross
+    the process boundary to instrument LayeredIgnore.__init__.
+    """
+    bundle_dir = Path(__file__).resolve().parent.parent
+    flow_file = bundle_dir / "java_index_flow_lancedb.py"
+    if not flow_file.is_file():
+        pytest.skip(f"Flow file not found: {flow_file}")
+
+    source = flow_file.read_text(encoding="utf-8")
+
+    # Count builder.provide(IGNORE, ...) calls - should be exactly one (in coco_lifespan)
+    provide_count = source.count("builder.provide(IGNORE,")
+    assert provide_count == 1, f"Expected 1 builder.provide(IGNORE,) call, found {provide_count}"
+
+    # Count coco.use_context(IGNORE) calls - should be exactly three (process_*_file)
+    use_count = source.count("coco.use_context(IGNORE)")
+    assert use_count == 3, f"Expected 3 coco.use_context(IGNORE) calls, found {use_count}"
+
+    # Verify no leftover LayeredIgnore(project_root).is_ignored calls in process sites
+    # (the sentinel grep would catch this, but we assert it here for completeness)
+    lines = source.split("\n")
+    for i, line in enumerate(lines, 1):
+        if "def process_" in line and "file(" in line:
+            # Found a process_*_file function definition
+            # Check the def line plus the next ~10 lines (slice bounds clamp at EOF)
+            func_body = "\n".join(lines[i - 1 : i + 10])
+            if "LayeredIgnore(project_root).is_ignored" in func_body:
+                pytest.fail(f"Found LayeredIgnore(project_root).is_ignored in process_*_file at line {i}")
+
+    # All structure checks passed
diff --git a/tests/test_path_filtering.py b/tests/test_path_filtering.py
index 78e40f3..f44640a 100644
--- a/tests/test_path_filtering.py
+++ b/tests/test_path_filtering.py
@@ -258,3 +258,99 @@ def test_unconditional_prune_dirs_remain_pruned_anywhere(tmp_path: Path) -> None
     li = LayeredIgnore(root, use_gitignore=False)
     files = list(iter_java_source_files(root, ignore=li))
     assert files == []
+
+
+def test_is_ignored_mega_caches_by_directory(tmp_path: Path) -> None:
+    """Assert _mega is computed once per directory and subsequent same-dir calls hit cache."""
+    root = tmp_path / "p"
+    root.mkdir()
+    (root / ".java-codebase-rag" / "ignore").parent.mkdir(parents=True)
+    (root / ".java-codebase-rag" / "ignore").write_text("**/Generated*.java\n", encoding="utf-8")
+
+    dir1 = root / "src" / "main"
+    dir1.mkdir(parents=True)
+    file1 = dir1 / "GeneratedFoo.java"
+    file1.write_text("class GeneratedFoo {}\n", encoding="utf-8")
+
+    file2 = dir1 / "GeneratedBar.java"
+    file2.write_text("class GeneratedBar {}\n", encoding="utf-8")
+
+    dir2 = root / "src" / "test"
+    dir2.mkdir(parents=True)
+    file3 = dir2 / "GeneratedTest.java"
+    file3.write_text("class GeneratedTest {}\n", encoding="utf-8")
+
+    li = LayeredIgnore(root, use_gitignore=False)
+
+    # Clear the cache to start fresh
+    li._mega_cache.clear()
+
+    # First call for file1 in dir1 should cache the result
+    assert li.is_ignored(file1) is True
+    # Second call for file2 in same dir should hit cache (same cache key)
+    assert li.is_ignored(file2) is True
+    # Call for file3 in different dir should compute and cache separately
+    assert li.is_ignored(file3) is True
+
+    # Should have exactly 2 cache entries (one per directory)
+    assert len(li._mega_cache) == 2
+    # Cache keys should be the parent directories
+    assert "src/main" in li._mega_cache
+    assert "src/test" in li._mega_cache
+
+
+def test_layered_ignore_memo_preserves_decisions(tmp_path: Path) -> None:
+    """For a corpus with nested ignore + gitignore negations, assert is_ignored is
+    identical with and without the cache."""
+    root = tmp_path / "p"
+    root.mkdir()
+
+    # Project root ignores all Generated*.java
+    pr = root / ".java-codebase-rag" / "ignore"
+    pr.parent.mkdir(parents=True)
+    pr.write_text("**/Generated*.java\n", encoding="utf-8")
+
+    # Nested dir negates for a specific subdirectory
+    nested = root / "svc" / ".java-codebase-rag" / "ignore"
+    nested.parent.mkdir(parents=True)
+    nested.write_text("!**/Generated*.java\n", encoding="utf-8")
+
+    # Gitignore at root adds another pattern
+    (root / ".gitignore").write_text("**/customout/**\n", encoding="utf-8")
+
+    # Create test files
+    dir1 = root / "svc" / "src"
+    dir1.mkdir(parents=True)
+    hit1 = dir1 / "GeneratedFoo.java"
+    hit1.write_text("class GeneratedFoo {}\n", encoding="utf-8")
+
+    dir2 = root / "svc" / "customout"
+    dir2.mkdir(parents=True)
+    hit2 = dir2 / "X.java"
+    hit2.write_text("class X {}\n", encoding="utf-8")
+
+    dir3 = root / "other" / "src"
+    dir3.mkdir(parents=True)
+    hit3 = dir3 / "GeneratedBar.java"
+    hit3.write_text("class GeneratedBar {}\n", encoding="utf-8")
+
+    # Test with gitignore enabled (cached)
+    li_cached = LayeredIgnore(root, use_gitignore=True)
+    assert li_cached.is_ignored(hit1) is False  # negated by nested
+    assert li_cached.is_ignored(hit2) is True  # gitignore pattern
+    assert li_cached.is_ignored(hit3) is True  # project-root pattern
+
+    # Test without cache by creating a fresh instance each time (simulates old behavior)
+    li_uncached1 = LayeredIgnore(root, use_gitignore=True)
+    assert li_uncached1.is_ignored(hit1) is False
+
+    li_uncached2 = LayeredIgnore(root, use_gitignore=True)
+    assert li_uncached2.is_ignored(hit2) is True
+
+    li_uncached3 = LayeredIgnore(root, use_gitignore=True)
+    assert li_uncached3.is_ignored(hit3) is True
+
+    # Warm (memoized) lookups must reproduce the cold decisions asserted above
+    assert li_cached.is_ignored(hit1) is False
+    assert li_cached.is_ignored(hit2) is True
+    assert li_cached.is_ignored(hit3) is True