diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index b6b2cde9..68a3b2dd 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -87,7 +87,9 @@ A single file at the project root (the directory you pass as `--source-root`, or # Index directory: where Lance tables, code_graph.kuzu, and cocoindex.db live. # - Tilde (`~`) is expanded; `$VAR` is NOT (use absolute paths or `~`). -# - Relative paths resolve against source_root, not cwd. +# - Relative paths written in this file resolve against the directory containing the config file (the same +# base used for a relative source_root), not cwd. The bare default ./.java-codebase-rag +# (when this key is omitted) still sits beside the resolved source_root. # - Env: JAVA_CODEBASE_RAG_INDEX_DIR. CLI: --index-dir. Default: ./.java-codebase-rag/ index_dir: ./.java-codebase-rag @@ -217,7 +219,7 @@ async_producer_overrides: | Field | Expanded? | Notes | |---|---|---| -| `index_dir` | partial | `~` expanded; `$VAR` is NOT expanded. Relative paths resolve against `source_root`. | +| `index_dir` | partial | `~` expanded; `$VAR` is NOT expanded. A YAML relative path resolves against the config file's directory (same base as `source_root`); the default `./.java-codebase-rag` sits beside the resolved `source_root`. | | `embedding.model` (when path-shaped) | yes | Path-shape = starts with `/`, `./`, `../`, `~`, or contains `$`. Plain `org/name` is treated as a hub id and passed through. Applies to the value after CLI > env > YAML > default precedence. Long-lived MCP hosts also apply the same expansion when reading `SBERT_MODEL` from the process environment (so table metadata and search agree with `index_common` defaults). | | `embedding.device` | n/a | Device strings (`cpu`, `cuda`, `mps`) aren't paths. | | `microservice_roots[*]` | no | Each entry is a directory **name** relative to `source_root`, not an arbitrary path. | @@ -225,7 +227,7 @@ async_producer_overrides: **Tips & gotchas:** -- **The file must be at `source_root`**, not in `$HOME`. 
The MCP server reads `JAVA_CODEBASE_RAG_SOURCE_ROOT` to find it; the CLI uses `--source-root` (else cwd). +- **The config file may live anywhere under your project, including a subdirectory of the Java tree.** Both the CLI (`init` / `increment` / `reprocess`) and the MCP server walk up from cwd to find `.java-codebase-rag.yml`, then resolve `source_root` and `index_dir` relative to the config file's directory. So a config in `my-context/` that sets `source_root: ../` and `index_dir: ../.java-codebase-rag` resolves identically for the CLI and the MCP server. Keep the file under your project (not `$HOME`); set `JAVA_CODEBASE_RAG_SOURCE_ROOT` (MCP) or `--source-root` (CLI) only to override the discovered location — an explicit override wins and suppresses the YAML `source_root` field. - **Don't commit secrets** into this YAML — it sits next to your source tree and is read by every operator who clones it. - **Rebuild after editing brownfield overrides.** Run a full `java-codebase-rag reprocess` (no flags) so Lance and Kuzu stay coherent, or use `--graph-only` / `--vectors-only` when you know only one store needs invalidation. Editing `embedding.model` requires a vector rebuild (`reprocess` or `--vectors-only`). - **Diagnose what's loaded.** `java-codebase-rag meta` prints the resolved config and each value's `*_source` (`cli` / `env` / `yaml` / `default`) — see `embedding_model_source`, `embedding_device_source`, `index_dir_source`. diff --git a/java_codebase_rag/config.py b/java_codebase_rag/config.py index 6603dbc3..23a9308a 100644 --- a/java_codebase_rag/config.py +++ b/java_codebase_rag/config.py @@ -306,9 +306,19 @@ def _pick_bool( def _resolve_index_dir_path( *, source_root: Path, + config_dir: Path, cli_index_dir: str | None, yaml_dict: dict[str, Any], ) -> tuple[Path, SettingSource]: + # Bases for relative paths: + # - YAML ``index_dir`` -> the config file's directory (``config_dir``), + # the SAME base used for YAML ``source_root``. Paths written in the + # config file are relative to the file, so both keys stay consistent. 
+ # - CLI / env ``index_dir`` -> ``source_root`` (unchanged). These are not + # "in the config file"; preserving the existing base avoids a semantics + # change for operators who pass ``--index-dir`` on the command line. + # - Default ``./.java-codebase-rag`` -> ``source_root`` so the index sits + # beside the Java tree (the layout ``discover_project_root`` anchors on). raw_cli = cli_index_dir.strip() if isinstance(cli_index_dir, str) else None if raw_cli: p = Path(raw_cli).expanduser() @@ -324,7 +334,7 @@ def _resolve_index_dir_path( idx = yaml_dict.get("index_dir") if isinstance(idx, str) and idx.strip(): p = Path(idx.strip()).expanduser() - out = p.resolve() if p.is_absolute() else (source_root / p).resolve() + out = p.resolve() if p.is_absolute() else (config_dir / p).resolve() return out, "yaml" return (source_root / ".java-codebase-rag").resolve(), "default" @@ -368,7 +378,7 @@ def resolve_operator_config( root = config_dir index_dir, index_src = _resolve_index_dir_path( - source_root=root, cli_index_dir=cli_index_dir, yaml_dict=yaml_dict + source_root=root, config_dir=config_dir, cli_index_dir=cli_index_dir, yaml_dict=yaml_dict ) model, model_src = _pick_str( cli_val=cli_embedding_model, diff --git a/server.py b/server.py index 69675648..c151f941 100644 --- a/server.py +++ b/server.py @@ -155,6 +155,27 @@ def _project_root() -> Path: return discovered if discovered is not None else Path.cwd().resolve() +def _source_root_for_operator_config() -> Path | None: + """``source_root`` arg to hand ``resolve_operator_config`` from the MCP server. + + Returns ``JAVA_CODEBASE_RAG_SOURCE_ROOT`` when set (an explicit operator + override that wins and suppresses the YAML ``source_root`` field, exactly + like CLI ``--source-root``), otherwise ``None`` — so + ``resolve_operator_config`` runs its OWN walk-up discovery and HONORS the + YAML ``source_root`` field, matching the CLI (``init`` / ``increment`` / + ``reprocess``) path. 
+ + Do NOT pass ``_project_root()`` (the walk-up-discovered dir) here: a + non-``None`` value routes into the "explicit source root" branch that + skips the YAML ``source_root`` field, which made the MCP server and the + CLI resolve different ``source_root`` / ``index_dir`` from the same config + file (the init-vs-MCP index_dir divergence). ``_project_root()`` is kept + only for the ``_resolve_lancedb_uri()`` fallback below. + """ + env = os.environ.get("JAVA_CODEBASE_RAG_SOURCE_ROOT", "").strip() + return Path(env).expanduser().resolve() if env else None + + def _cocoindex_subprocess_env(project_root: Path) -> dict[str, str]: sub_env = os.environ.copy() sub_env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(project_root) @@ -654,7 +675,7 @@ def main() -> None: # Load YAML config and apply embedding settings to environment # This ensures SBERT_MODEL and SBERT_DEVICE from .java-codebase-rag.yml are available # before any tool handler runs (same behavior as CLI path) - cfg = resolve_operator_config(source_root=_project_root()) + cfg = resolve_operator_config(source_root=_source_root_for_operator_config()) cfg.apply_to_os_environ() mcp_v2.set_hints_enabled(cfg.hints_enabled) diff --git a/tests/test_config.py b/tests/test_config.py index 4b2aefc8..a07d2f41 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -175,6 +175,57 @@ def test_source_root_from_yaml_absolute(self, tmp_path, monkeypatch): assert result.source_root == Path(absolute_path) +class TestIndexDirRelativeToConfigDir: + """YAML ``index_dir`` must resolve against the config file's directory. + + ``source_root`` already resolves against the config dir (see + ``TestSourceRootFromYaml``). 
``index_dir`` must use the SAME base so a + user can express both keys relative to the config file — otherwise a + ``../`` in ``index_dir`` gets re-applied on top of the already-resolved + ``source_root`` and overshoots by one level (the "init indexes ~/" + symptom when the config lives in a subdirectory of the Java tree). + """ + + def test_yaml_index_dir_double_dot_resolves_against_config_dir(self, tmp_path, monkeypatch): + """``index_dir: ../x`` is relative to the config file's directory, not source_root.""" + monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False) + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + + config_dir = tmp_path / "my-project-context" + config_dir.mkdir() + (config_dir / YAML_CONFIG_FILENAMES[0]).write_text( + "source_root: ../\nindex_dir: ../.java-codebase-rag\n" + ) + monkeypatch.chdir(config_dir) + + result = resolve_operator_config(source_root=None) + # source_root ../ -> tmp_path (one level above the config file) + assert result.source_root == tmp_path + # index_dir ../ -> tmp_path/.java-codebase-rag (one level above the config file), + # NOT tmp_path.parent/.java-codebase-rag (which is what resolving against + # the already-resolved source_root would produce). 
+ assert result.index_dir == (tmp_path / ".java-codebase-rag").resolve() + + def test_yaml_index_dir_bare_resolves_against_config_dir(self, tmp_path, monkeypatch): + """``index_dir: x`` (no ``../``) sits next to the config file.""" + monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False) + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + + config_dir = tmp_path / "my-project-context" + config_dir.mkdir() + (config_dir / YAML_CONFIG_FILENAMES[0]).write_text( + "source_root: ../\nindex_dir: .java-codebase-rag\n" + ) + monkeypatch.chdir(config_dir) + + result = resolve_operator_config(source_root=None) + assert result.source_root == tmp_path + # Bare path resolves against the config dir, so the index sits beside + # the config file — NOT beside source_root. + assert result.index_dir == (config_dir / ".java-codebase-rag").resolve() + assert result.index_dir_source == "yaml" + + class TestSourceRootPrecedence: """Tests for source_root precedence chain.""" diff --git a/tests/test_mcp_server_project_root.py b/tests/test_mcp_server_project_root.py index 48d90d8d..df3b45c1 100644 --- a/tests/test_mcp_server_project_root.py +++ b/tests/test_mcp_server_project_root.py @@ -1,6 +1,6 @@ """Tests for server.py _project_root() function in the MCP server context.""" -from java_codebase_rag.config import YAML_CONFIG_FILENAMES +from java_codebase_rag.config import YAML_CONFIG_FILENAMES, resolve_operator_config class TestProjectRoot: @@ -23,3 +23,59 @@ def test_project_root_uses_discover_when_env_unset(self, tmp_path, monkeypatch): result = _project_root() assert result == tmp_path + + +class TestSourceRootForOperatorConfig: + """The MCP server must honor the YAML ``source_root`` field like the CLI. + + ``main()`` passes ``_source_root_for_operator_config()`` (not the + walk-up-discovered dir) as the ``source_root`` arg to + ``resolve_operator_config``. 
When the env override is unset that is + ``None``, which routes through the walk-up branch that APPLIES the YAML + ``source_root`` field. Passing the discovered dir instead would route into + the "explicit source root" branch and silently ignore the YAML field, + diverging the MCP server from ``init``/``increment``/``reprocess``. + """ + + def test_returns_none_when_env_unset(self, tmp_path, monkeypatch): + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + from server import _source_root_for_operator_config + + assert _source_root_for_operator_config() is None + + def test_returns_env_path_when_set(self, tmp_path, monkeypatch): + explicit = tmp_path / "explicit-root" + explicit.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(explicit)) + from server import _source_root_for_operator_config + + assert _source_root_for_operator_config() == explicit.resolve() + + def test_mcp_and_init_resolve_identically_for_nested_config(self, tmp_path, monkeypatch): + """Regression for the init-vs-MCP index_dir divergence. + + Config lives in a subdirectory of the Java tree (``my-project-context/``) + and points both ``source_root`` and ``index_dir`` one level up. The MCP + server (env unset) and the CLI must resolve the SAME source_root and + index_dir, landing on the real index at ``tmp_path/.java-codebase-rag``. 
+ """ + monkeypatch.delenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", raising=False) + monkeypatch.delenv("JAVA_CODEBASE_RAG_INDEX_DIR", raising=False) + + config_dir = tmp_path / "my-project-context" + config_dir.mkdir() + (config_dir / YAML_CONFIG_FILENAMES[0]).write_text( + "source_root: ../\nindex_dir: ../.java-codebase-rag\n" + ) + monkeypatch.chdir(config_dir) + + from server import _source_root_for_operator_config + + mcp = resolve_operator_config(source_root=_source_root_for_operator_config()) + cli = resolve_operator_config(source_root=None) + + assert mcp.source_root == tmp_path + assert mcp.index_dir == (tmp_path / ".java-codebase-rag").resolve() + assert mcp.source_root == cli.source_root + assert mcp.index_dir == cli.index_dir +