refactor(scan): drop ecosystem field, fall back to dir-mode for any folder

msrkp · msrkp · commit ea1b1271bbbb · 2026-04-27T12:53:15.000+05:30
- discover(): if a path doesn't look like node_modules or site-packages, walk it
  as a single anonymous package instead of erroring out. Refines SKIP_DIRS to
  pure-metadata dirs only — .venv, node_modules, etc. stay in scope because
  malware lives there by design.
- Drop the `ecosystem` field everywhere (schema, orchestrator, CLI display,
  prompt template, tests). Discovery layout no longer needs to be carried into
  reports; package@version is enough.
- Drop --max-files-per-pkg. --budget-usd plus the preflight estimate already
  cover cost control, and the per-package cap conflicted with dir-mode.
- Bump PROMPT_VERSION to invalidate cached verdicts produced under the old
  user-prompt format.
- README: document the new discovery model + dir-mode fallback.
- Test: cover dir-mode fallback and .git exclusion.

Verified end-to-end in cull-sandbox against a Datadog compromised_lib zip
(@emilgroup/auth-sdk-node@1.21.1): discovered → estimated $0.0023 →
scanned for $0.0027 → flagged malicious with high-confidence findings
(postinstall hook, systemd persistence, npm-search worm propagation,
auto-publish hijack).
diff --git a/README.md b/README.md
@@ -37,12 +37,13 @@ LLM-scan installed package source files for suspicious supply-chain behavior.
 
 ```bash
 export GEMINI_API_KEY=...
-cull scan ./node_modules
-cull scan ./.venv/lib/python3.12/site-packages
+cull scan ./node_modules                              # npm install dir
+cull scan ./.venv/lib/python3.12/site-packages         # python install dir
+cull scan ./my-package                                 # any folder of source code
 cull scan ./node_modules ./.venv/lib/python3.12/site-packages -o report.json
 ```
 
-`PATH` must point at a package install directory: `node_modules`, `site-packages`, or a directory that clearly looks like one.
+`cull scan` auto-detects `node_modules` and `site-packages` layouts; anything else is scanned as a single anonymous folder. Pure-metadata dirs (`.git`, `__pycache__`, lint caches) are skipped at walk time. Dependency dirs (`.venv`, nested `node_modules`) are *not* — that is exactly the attack surface this tool is for. Use `--estimate-only` first to see how big the scan will be.
 
 Every run prints a preflight estimate first:
 
diff --git a/cull/llm/cli.py b/cull/llm/cli.py
@@ -22,7 +22,6 @@ def add_arguments(parser: argparse.ArgumentParser) -> None:
     parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
     parser.add_argument("--api-key-env", default="GEMINI_API_KEY")
     parser.add_argument("--concurrency", type=int, default=8)
-    parser.add_argument("--max-files-per-pkg", type=int, default=200)
     parser.add_argument("--chunk-tokens", type=int, default=4000)
     parser.add_argument("--chunk-overlap-tokens", type=int, default=600)
     parser.add_argument("--include-tests", action="store_true")
@@ -40,7 +39,6 @@ def run(args: argparse.Namespace) -> NoReturn:
         include_tests=args.include_tests,
         no_cache=args.no_cache,
         concurrency=max(1, args.concurrency),
-        max_files_per_pkg=max(1, args.max_files_per_pkg),
         chunk_tokens=max(1, args.chunk_tokens),
         chunk_overlap_tokens=max(0, args.chunk_overlap_tokens),
         budget_usd=args.budget_usd,
@@ -121,7 +119,7 @@ def _print_summary(packages: list[PackageReport], cost: dict[str, int | float |
     tprint()
     for package in [*malicious, *suspicious][:10]:
         color = red if package.verdict.level == "malicious" else yellow
-        tprint(color(f"▸ {package.package}@{package.version} ({package.ecosystem}) — {package.verdict.level}"))
+        tprint(color(f"▸ {package.package}@{package.version} — {package.verdict.level}"))
         for file in [item for item in package.files if item.verdict.level != "clean"][:3]:
             tprint(f"  {file.verdict.level}  {file.file.rel_path} ({file.verdict.confidence})")
             for finding in file.verdict.findings[:2]:
@@ -165,7 +163,7 @@ def _markdown_report(packages: list[PackageReport], cost: dict[str, int | float
     for package in packages:
         if package.verdict.level == "clean":
             continue
-        lines.append(f"## {package.package}@{package.version} ({package.ecosystem}) — {package.verdict.level}")
+        lines.append(f"## {package.package}@{package.version} — {package.verdict.level}")
         lines.append("")
         for file in package.files:
             if file.verdict.level == "clean":
diff --git a/cull/llm/discover.py b/cull/llm/discover.py
@@ -4,11 +4,17 @@
 import json
 import os
 from pathlib import Path
-from typing import cast
 
-from .schema import Ecosystem, PackageFile
+from .schema import PackageFile
 
-SKIP_DIRS = {".git", ".hg", ".svn", ".cache", "__pycache__", ".pytest_cache"}
+# Pure-metadata directories with no scannable source. We do NOT skip dependency
+# dirs (.venv, node_modules, …) — malware lives in those by design and that is
+# precisely what cull is for.
+SKIP_DIRS = {
+    ".git", ".hg", ".svn",
+    "__pycache__",
+    ".cache", ".pytest_cache", ".mypy_cache", ".ruff_cache", ".tox",
+}
 
 
 def discover(paths: list[str]) -> tuple[list[PackageFile], list[str]]:
@@ -22,18 +28,13 @@ def discover(paths: list[str]) -> tuple[list[PackageFile], list[str]]:
             errors.append(f"{raw_path}: not a directory")
             continue
 
-        matched = False
         if _looks_like_node_modules(root):
-            matched = True
             for node_modules in _node_modules_dirs(root):
                 files.extend(_discover_npm(node_modules, seen_roots))
-
-        if _looks_like_site_packages(root):
-            matched = True
+        elif _looks_like_site_packages(root):
             files.extend(_discover_python(root))
-
-        if not matched:
-            errors.append(f"{raw_path}: does not look like node_modules or site-packages")
+        else:
+            files.extend(_discover_dir(root))
 
     return files, errors
 
@@ -77,8 +78,9 @@ def _discover_npm(node_modules: Path, seen_roots: set[Path]) -> list[PackageFile
         seen_roots.add(real_root)
 
         package, version = _read_package_json(package_root)
-        for path in _walk_files(package_root):
-            files.append(_package_file("npm", package, version, package_root, path))
+        # Skip nested node_modules: they're enumerated separately by _node_modules_dirs.
+        for path in _walk_files(package_root, skip=SKIP_DIRS | {"node_modules"}):
+            files.append(_package_file(package, version, package_root, path))
     return files
 
 
@@ -112,19 +114,23 @@ def _discover_python(site_packages: Path) -> list[PackageFile]:
             if real in seen:
                 continue
             seen.add(real)
-            files.append(_package_file("python", name, version, site_packages, path))
+            files.append(_package_file(name, version, site_packages, path))
 
     for path in site_packages.glob("*.pth"):
         if path.is_file() and not path.is_symlink():
-            files.append(_package_file("python", f"pth:{path.stem}", "unknown", site_packages, path))
+            files.append(_package_file(f"pth:{path.stem}", "unknown", site_packages, path))
 
     return files
 
 
-def _walk_files(root: Path) -> list[Path]:
+def _discover_dir(root: Path) -> list[PackageFile]:
+    return [_package_file(root.name, "unknown", root, path) for path in _walk_files(root)]
+
+
+def _walk_files(root: Path, *, skip: set[str] = SKIP_DIRS) -> list[Path]:
     paths: list[Path] = []
     for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
-        dirnames[:] = [name for name in dirnames if name not in SKIP_DIRS and name != "node_modules"]
+        dirnames[:] = [name for name in dirnames if name not in skip]
         current = Path(dirpath)
         for filename in filenames:
             path = current / filename
@@ -133,9 +139,8 @@ def _walk_files(root: Path) -> list[Path]:
     return paths
 
 
-def _package_file(ecosystem: str, package: str, version: str, root: Path, path: Path) -> PackageFile:
+def _package_file(package: str, version: str, root: Path, path: Path) -> PackageFile:
     return PackageFile(
-        ecosystem=cast(Ecosystem, ecosystem),
         package=package,
         version=version,
         package_root=root,
diff --git a/cull/llm/orchestrator.py b/cull/llm/orchestrator.py
@@ -5,7 +5,6 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
-from typing import cast
 
 from .cache import VerdictCache, cache_key
 from .chunker import Chunk, chunk_text
@@ -14,7 +13,7 @@
 from .filter import read_text, should_scan
 from .pricing import RunCost, cost_for, estimate_tokens
 from .prompts import PROMPT_VERSION, SYSTEM_PROMPT, build_user_prompt
-from .schema import Ecosystem, Estimate, FileReport, PackageFile, PackageReport, Verdict, error_verdict, merge_verdicts
+from .schema import Estimate, FileReport, PackageFile, PackageReport, Verdict, error_verdict, merge_verdicts
 
 # Per-chunk output budget assumed when projecting cost or when a provider does
 # not return usage. ~180 tokens covers the JSON envelope plus 1-3 findings.
@@ -26,7 +25,6 @@ class ScanOptions:
     include_tests: bool
     no_cache: bool
     concurrency: int
-    max_files_per_pkg: int
     chunk_tokens: int
     chunk_overlap_tokens: int
     budget_usd: float | None = None
@@ -65,7 +63,6 @@ def prepare(paths: list[str], options: ScanOptions) -> PreparedFiles:
     files: list[PackageFile] = []
     skipped = 0
     seen_real_paths: set[Path] = set()
-    per_package_count: dict[tuple[str, str, str], int] = {}
 
     for file in discovered:
         if file.real_path in seen_real_paths:
@@ -79,14 +76,7 @@ def prepare(paths: list[str], options: ScanOptions) -> PreparedFiles:
                 errors.append(f"{file.abs_path}: {reason}")
             continue
 
-        package_key = (file.ecosystem, file.package, file.version)
-        current_count = per_package_count.get(package_key, 0)
-        if current_count >= options.max_files_per_pkg:
-            skipped += 1
-            continue
-
         seen_real_paths.add(file.real_path)
-        per_package_count[package_key] = current_count + 1
         files.append(file)
 
     return PreparedFiles(files, skipped, errors)
@@ -96,7 +86,7 @@ def estimate(files: list[PackageFile], *, model: str, options: ScanOptions, skip
     input_tokens = 0
     output_tokens = 0
     chunk_count = 0
-    packages = {(file.ecosystem, file.package, file.version) for file in files}
+    packages = {(file.package, file.version) for file in files}
 
     for file in files:
         try:
@@ -227,7 +217,6 @@ def _prompt(file: PackageFile, chunk: Chunk) -> str:
     return build_user_prompt(
         package=file.package,
         version=file.version,
-        ecosystem=file.ecosystem,
         path=str(file.rel_path),
         chunk_index=chunk.index,
         chunk_count=chunk.total,
@@ -236,24 +225,23 @@ def _prompt(file: PackageFile, chunk: Chunk) -> str:
 
 
 def _package_reports(files: list[FileReport]) -> list[PackageReport]:
-    grouped: dict[tuple[str, str, str], list[FileReport]] = {}
+    grouped: dict[tuple[str, str], list[FileReport]] = {}
     for file in files:
-        key = (file.file.ecosystem, file.file.package, file.file.version)
+        key = (file.file.package, file.file.version)
         grouped.setdefault(key, []).append(file)
 
     reports: list[PackageReport] = []
-    for (ecosystem, package, version), file_reports in grouped.items():
+    for (package, version), file_reports in grouped.items():
         verdict = merge_verdicts([file.verdict for file in file_reports])
         reports.append(
             PackageReport(
-                ecosystem=cast("Ecosystem", ecosystem),
                 package=package,
                 version=version,
                 files=file_reports,
                 verdict=verdict,
             )
         )
-    return sorted(reports, key=lambda report: (report.ecosystem, report.package, report.version))
+    return sorted(reports, key=lambda report: (report.package, report.version))
 
 
 class Progress:
diff --git a/cull/llm/prompts.py b/cull/llm/prompts.py
@@ -3,7 +3,7 @@
 import json
 from pathlib import Path
 
-PROMPT_VERSION = "2026-04-26.1"
+PROMPT_VERSION = "2026-04-27.1"
 
 SYSTEM_PROMPT = """You are cull, a defensive malware scanner for npm and Python packages.
 
@@ -25,7 +25,6 @@
 """
 
 USER_PROMPT_TEMPLATE = """Package: {package}@{version}
-Ecosystem: {ecosystem}
 File: {path}
 Chunk: {chunk_index}/{chunk_count}
 
@@ -43,7 +42,6 @@ def build_user_prompt(
     *,
     package: str,
     version: str,
-    ecosystem: str,
     path: str,
     chunk_index: int,
     chunk_count: int,
@@ -52,7 +50,6 @@ def build_user_prompt(
     return USER_PROMPT_TEMPLATE.format(
         package=package,
         version=version,
-        ecosystem=ecosystem,
         path=path,
         chunk_index=chunk_index,
         chunk_count=chunk_count,
diff --git a/cull/llm/schema.py b/cull/llm/schema.py
@@ -7,7 +7,6 @@
 
 Level = Literal["clean", "suspicious", "malicious", "error"]
 Confidence = Literal["low", "medium", "high"]
-Ecosystem = Literal["npm", "python"]
 
 INDICATORS = {
     "obfuscation",
@@ -33,7 +32,6 @@
 
 @dataclass(frozen=True)
 class PackageFile:
-    ecosystem: Ecosystem
     package: str
     version: str
     package_root: Path
@@ -78,7 +76,6 @@ class FileReport:
 
     def to_dict(self) -> dict[str, Any]:
         return {
-            "ecosystem": self.file.ecosystem,
             "package": self.file.package,
             "version": self.file.version,
             "path": str(self.file.rel_path),
@@ -91,15 +88,13 @@ def to_dict(self) -> dict[str, Any]:
 
 @dataclass(frozen=True)
 class PackageReport:
-    ecosystem: Ecosystem
     package: str
     version: str
     files: list[FileReport]
     verdict: Verdict
 
     def to_dict(self) -> dict[str, Any]:
         return {
-            "ecosystem": self.ecosystem,
             "package": self.package,
             "version": self.version,
             "verdict": self.verdict.to_dict(),
diff --git a/tests/test_llm_scan.py b/tests/test_llm_scan.py
@@ -109,7 +109,6 @@ def _make_file(self, root: Path, rel: str, content: bytes = b"x = 1\n") -> Packa
         path.parent.mkdir(parents=True, exist_ok=True)
         path.write_bytes(content)
         return PackageFile(
-            ecosystem="npm",
             package="demo",
             version="1.0.0",
             package_root=root,
@@ -248,7 +247,6 @@ def test_node_modules_discovery_and_estimate(self):
                 include_tests=False,
                 no_cache=True,
                 concurrency=1,
-                max_files_per_pkg=200,
                 chunk_tokens=4000,
                 chunk_overlap_tokens=600,
                 progress=False,
@@ -262,6 +260,23 @@ def test_node_modules_discovery_and_estimate(self):
             self.assertEqual(result.chunk_count, 2)
             self.assertIsNotNone(result.estimated_cost_usd)
 
+    def test_dir_mode_falls_back_for_unrecognized_folders(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp) / "myproj"
+            (root / "src").mkdir(parents=True)
+            (root / ".git").mkdir()
+            (root / ".git" / "config").write_text("[core]\n", encoding="utf-8")
+            (root / "src" / "main.js").write_text("console.log('hi')\n", encoding="utf-8")
+            (root / "src" / "lib.py").write_text("print('hi')\n", encoding="utf-8")
+            (root / "README.md").write_text("# hi\n", encoding="utf-8")
+
+            files, errors = discover([str(root)])
+            self.assertFalse(errors)
+            self.assertTrue(files)
+            self.assertTrue(all(file.package == "myproj" for file in files))
+            # .git is pure metadata: skipped at walk time, not at filter time.
+            self.assertFalse(any(".git" in file.rel_path.parts for file in files))
+
 
 if __name__ == "__main__":
     unittest.main()