Skip to content

Commit ea1b127

Browse files
committed
refactor(scan): drop ecosystem field, fall back to dir-mode for any folder
- discover(): if a path doesn't look like node_modules or site-packages, walk it as a single anonymous package instead of erroring out. Refines SKIP_DIRS to pure-metadata dirs only — .venv, node_modules, etc. stay in scope because malware lives there by design.
- Drop the `ecosystem` field everywhere (schema, orchestrator, CLI display, prompt template, tests). Discovery layout no longer needs to be carried into reports; package@version is enough.
- Drop --max-files-per-pkg. --budget-usd plus the preflight estimate already cover cost control, and the per-package cap conflicted with dir-mode.
- Bump PROMPT_VERSION to invalidate cached verdicts produced under the old user-prompt format.
- README: document the new discovery model + dir-mode fallback.
- Test: cover dir-mode fallback and .git exclusion.

Verified end-to-end in cull-sandbox against a Datadog compromised_lib zip (@emilgroup/[email protected]): discovered → estimated $0.0023 → scanned for $0.0027 → flagged malicious with high-confidence findings (postinstall hook, systemd persistence, npm-search worm propagation, auto-publish hijack).
1 parent ddb09cb commit ea1b127

7 files changed

Lines changed: 54 additions & 55 deletions

File tree

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,13 @@ LLM-scan installed package source files for suspicious supply-chain behavior.
3737

3838
```bash
3939
export GEMINI_API_KEY=...
40-
cull scan ./node_modules
41-
cull scan ./.venv/lib/python3.12/site-packages
40+
cull scan ./node_modules # npm install dir
41+
cull scan ./.venv/lib/python3.12/site-packages # python install dir
42+
cull scan ./my-package # any folder of source code
4243
cull scan ./node_modules ./.venv/lib/python3.12/site-packages -o report.json
4344
```
4445

45-
`PATH` must point at a package install directory: `node_modules`, `site-packages`, or a directory that clearly looks like one.
46+
`cull scan` auto-detects `node_modules` and `site-packages` layouts; anything else is scanned as a single anonymous folder. Pure-metadata dirs (`.git`, `__pycache__`, lint caches) are skipped at walk time. Dependency dirs (`.venv`, nested `node_modules`) are *not* — that is exactly the attack surface this tool is for. Use `--estimate-only` first to see how big the scan will be.
4647

4748
Every run prints a preflight estimate first:
4849

cull/llm/cli.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ def add_arguments(parser: argparse.ArgumentParser) -> None:
2222
parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
2323
parser.add_argument("--api-key-env", default="GEMINI_API_KEY")
2424
parser.add_argument("--concurrency", type=int, default=8)
25-
parser.add_argument("--max-files-per-pkg", type=int, default=200)
2625
parser.add_argument("--chunk-tokens", type=int, default=4000)
2726
parser.add_argument("--chunk-overlap-tokens", type=int, default=600)
2827
parser.add_argument("--include-tests", action="store_true")
@@ -40,7 +39,6 @@ def run(args: argparse.Namespace) -> NoReturn:
4039
include_tests=args.include_tests,
4140
no_cache=args.no_cache,
4241
concurrency=max(1, args.concurrency),
43-
max_files_per_pkg=max(1, args.max_files_per_pkg),
4442
chunk_tokens=max(1, args.chunk_tokens),
4543
chunk_overlap_tokens=max(0, args.chunk_overlap_tokens),
4644
budget_usd=args.budget_usd,
@@ -121,7 +119,7 @@ def _print_summary(packages: list[PackageReport], cost: dict[str, int | float |
121119
tprint()
122120
for package in [*malicious, *suspicious][:10]:
123121
color = red if package.verdict.level == "malicious" else yellow
124-
tprint(color(f"▸ {package.package}@{package.version} ({package.ecosystem}) {package.verdict.level}"))
122+
tprint(color(f"▸ {package.package}@{package.version}{package.verdict.level}"))
125123
for file in [item for item in package.files if item.verdict.level != "clean"][:3]:
126124
tprint(f" {file.verdict.level} {file.file.rel_path} ({file.verdict.confidence})")
127125
for finding in file.verdict.findings[:2]:
@@ -165,7 +163,7 @@ def _markdown_report(packages: list[PackageReport], cost: dict[str, int | float
165163
for package in packages:
166164
if package.verdict.level == "clean":
167165
continue
168-
lines.append(f"## {package.package}@{package.version} ({package.ecosystem}) {package.verdict.level}")
166+
lines.append(f"## {package.package}@{package.version}{package.verdict.level}")
169167
lines.append("")
170168
for file in package.files:
171169
if file.verdict.level == "clean":

cull/llm/discover.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44
import json
55
import os
66
from pathlib import Path
7-
from typing import cast
87

9-
from .schema import Ecosystem, PackageFile
8+
from .schema import PackageFile
109

11-
SKIP_DIRS = {".git", ".hg", ".svn", ".cache", "__pycache__", ".pytest_cache"}
10+
# Pure-metadata directories with no scannable source. We do NOT skip dependency
11+
# dirs (.venv, node_modules, …) — malware lives in those by design and that is
12+
# precisely what cull is for.
13+
SKIP_DIRS = {
14+
".git", ".hg", ".svn",
15+
"__pycache__",
16+
".cache", ".pytest_cache", ".mypy_cache", ".ruff_cache", ".tox",
17+
}
1218

1319

1420
def discover(paths: list[str]) -> tuple[list[PackageFile], list[str]]:
@@ -22,18 +28,13 @@ def discover(paths: list[str]) -> tuple[list[PackageFile], list[str]]:
2228
errors.append(f"{raw_path}: not a directory")
2329
continue
2430

25-
matched = False
2631
if _looks_like_node_modules(root):
27-
matched = True
2832
for node_modules in _node_modules_dirs(root):
2933
files.extend(_discover_npm(node_modules, seen_roots))
30-
31-
if _looks_like_site_packages(root):
32-
matched = True
34+
elif _looks_like_site_packages(root):
3335
files.extend(_discover_python(root))
34-
35-
if not matched:
36-
errors.append(f"{raw_path}: does not look like node_modules or site-packages")
36+
else:
37+
files.extend(_discover_dir(root))
3738

3839
return files, errors
3940

@@ -77,8 +78,9 @@ def _discover_npm(node_modules: Path, seen_roots: set[Path]) -> list[PackageFile
7778
seen_roots.add(real_root)
7879

7980
package, version = _read_package_json(package_root)
80-
for path in _walk_files(package_root):
81-
files.append(_package_file("npm", package, version, package_root, path))
81+
# Skip nested node_modules: they're enumerated separately by _node_modules_dirs.
82+
for path in _walk_files(package_root, skip=SKIP_DIRS | {"node_modules"}):
83+
files.append(_package_file(package, version, package_root, path))
8284
return files
8385

8486

@@ -112,19 +114,23 @@ def _discover_python(site_packages: Path) -> list[PackageFile]:
112114
if real in seen:
113115
continue
114116
seen.add(real)
115-
files.append(_package_file("python", name, version, site_packages, path))
117+
files.append(_package_file(name, version, site_packages, path))
116118

117119
for path in site_packages.glob("*.pth"):
118120
if path.is_file() and not path.is_symlink():
119-
files.append(_package_file("python", f"pth:{path.stem}", "unknown", site_packages, path))
121+
files.append(_package_file(f"pth:{path.stem}", "unknown", site_packages, path))
120122

121123
return files
122124

123125

124-
def _walk_files(root: Path) -> list[Path]:
126+
def _discover_dir(root: Path) -> list[PackageFile]:
127+
return [_package_file(root.name, "unknown", root, path) for path in _walk_files(root)]
128+
129+
130+
def _walk_files(root: Path, *, skip: set[str] = SKIP_DIRS) -> list[Path]:
125131
paths: list[Path] = []
126132
for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
127-
dirnames[:] = [name for name in dirnames if name not in SKIP_DIRS and name != "node_modules"]
133+
dirnames[:] = [name for name in dirnames if name not in skip]
128134
current = Path(dirpath)
129135
for filename in filenames:
130136
path = current / filename
@@ -133,9 +139,8 @@ def _walk_files(root: Path) -> list[Path]:
133139
return paths
134140

135141

136-
def _package_file(ecosystem: str, package: str, version: str, root: Path, path: Path) -> PackageFile:
142+
def _package_file(package: str, version: str, root: Path, path: Path) -> PackageFile:
137143
return PackageFile(
138-
ecosystem=cast(Ecosystem, ecosystem),
139144
package=package,
140145
version=version,
141146
package_root=root,

cull/llm/orchestrator.py

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from concurrent.futures import ThreadPoolExecutor, as_completed
66
from dataclasses import dataclass
77
from pathlib import Path
8-
from typing import cast
98

109
from .cache import VerdictCache, cache_key
1110
from .chunker import Chunk, chunk_text
@@ -14,7 +13,7 @@
1413
from .filter import read_text, should_scan
1514
from .pricing import RunCost, cost_for, estimate_tokens
1615
from .prompts import PROMPT_VERSION, SYSTEM_PROMPT, build_user_prompt
17-
from .schema import Ecosystem, Estimate, FileReport, PackageFile, PackageReport, Verdict, error_verdict, merge_verdicts
16+
from .schema import Estimate, FileReport, PackageFile, PackageReport, Verdict, error_verdict, merge_verdicts
1817

1918
# Per-chunk output budget assumed when projecting cost or when a provider does
2019
# not return usage. ~180 tokens covers the JSON envelope plus 1-3 findings.
@@ -26,7 +25,6 @@ class ScanOptions:
2625
include_tests: bool
2726
no_cache: bool
2827
concurrency: int
29-
max_files_per_pkg: int
3028
chunk_tokens: int
3129
chunk_overlap_tokens: int
3230
budget_usd: float | None = None
@@ -65,7 +63,6 @@ def prepare(paths: list[str], options: ScanOptions) -> PreparedFiles:
6563
files: list[PackageFile] = []
6664
skipped = 0
6765
seen_real_paths: set[Path] = set()
68-
per_package_count: dict[tuple[str, str, str], int] = {}
6966

7067
for file in discovered:
7168
if file.real_path in seen_real_paths:
@@ -79,14 +76,7 @@ def prepare(paths: list[str], options: ScanOptions) -> PreparedFiles:
7976
errors.append(f"{file.abs_path}: {reason}")
8077
continue
8178

82-
package_key = (file.ecosystem, file.package, file.version)
83-
current_count = per_package_count.get(package_key, 0)
84-
if current_count >= options.max_files_per_pkg:
85-
skipped += 1
86-
continue
87-
8879
seen_real_paths.add(file.real_path)
89-
per_package_count[package_key] = current_count + 1
9080
files.append(file)
9181

9282
return PreparedFiles(files, skipped, errors)
@@ -96,7 +86,7 @@ def estimate(files: list[PackageFile], *, model: str, options: ScanOptions, skip
9686
input_tokens = 0
9787
output_tokens = 0
9888
chunk_count = 0
99-
packages = {(file.ecosystem, file.package, file.version) for file in files}
89+
packages = {(file.package, file.version) for file in files}
10090

10191
for file in files:
10292
try:
@@ -227,7 +217,6 @@ def _prompt(file: PackageFile, chunk: Chunk) -> str:
227217
return build_user_prompt(
228218
package=file.package,
229219
version=file.version,
230-
ecosystem=file.ecosystem,
231220
path=str(file.rel_path),
232221
chunk_index=chunk.index,
233222
chunk_count=chunk.total,
@@ -236,24 +225,23 @@ def _prompt(file: PackageFile, chunk: Chunk) -> str:
236225

237226

238227
def _package_reports(files: list[FileReport]) -> list[PackageReport]:
239-
grouped: dict[tuple[str, str, str], list[FileReport]] = {}
228+
grouped: dict[tuple[str, str], list[FileReport]] = {}
240229
for file in files:
241-
key = (file.file.ecosystem, file.file.package, file.file.version)
230+
key = (file.file.package, file.file.version)
242231
grouped.setdefault(key, []).append(file)
243232

244233
reports: list[PackageReport] = []
245-
for (ecosystem, package, version), file_reports in grouped.items():
234+
for (package, version), file_reports in grouped.items():
246235
verdict = merge_verdicts([file.verdict for file in file_reports])
247236
reports.append(
248237
PackageReport(
249-
ecosystem=cast("Ecosystem", ecosystem),
250238
package=package,
251239
version=version,
252240
files=file_reports,
253241
verdict=verdict,
254242
)
255243
)
256-
return sorted(reports, key=lambda report: (report.ecosystem, report.package, report.version))
244+
return sorted(reports, key=lambda report: (report.package, report.version))
257245

258246

259247
class Progress:

cull/llm/prompts.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
from pathlib import Path
55

6-
PROMPT_VERSION = "2026-04-26.1"
6+
PROMPT_VERSION = "2026-04-27.1"
77

88
SYSTEM_PROMPT = """You are cull, a defensive malware scanner for npm and Python packages.
99
@@ -25,7 +25,6 @@
2525
"""
2626

2727
USER_PROMPT_TEMPLATE = """Package: {package}@{version}
28-
Ecosystem: {ecosystem}
2928
File: {path}
3029
Chunk: {chunk_index}/{chunk_count}
3130
@@ -43,7 +42,6 @@ def build_user_prompt(
4342
*,
4443
package: str,
4544
version: str,
46-
ecosystem: str,
4745
path: str,
4846
chunk_index: int,
4947
chunk_count: int,
@@ -52,7 +50,6 @@ def build_user_prompt(
5250
return USER_PROMPT_TEMPLATE.format(
5351
package=package,
5452
version=version,
55-
ecosystem=ecosystem,
5653
path=path,
5754
chunk_index=chunk_index,
5855
chunk_count=chunk_count,

cull/llm/schema.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
Level = Literal["clean", "suspicious", "malicious", "error"]
99
Confidence = Literal["low", "medium", "high"]
10-
Ecosystem = Literal["npm", "python"]
1110

1211
INDICATORS = {
1312
"obfuscation",
@@ -33,7 +32,6 @@
3332

3433
@dataclass(frozen=True)
3534
class PackageFile:
36-
ecosystem: Ecosystem
3735
package: str
3836
version: str
3937
package_root: Path
@@ -78,7 +76,6 @@ class FileReport:
7876

7977
def to_dict(self) -> dict[str, Any]:
8078
return {
81-
"ecosystem": self.file.ecosystem,
8279
"package": self.file.package,
8380
"version": self.file.version,
8481
"path": str(self.file.rel_path),
@@ -91,15 +88,13 @@ def to_dict(self) -> dict[str, Any]:
9188

9289
@dataclass(frozen=True)
9390
class PackageReport:
94-
ecosystem: Ecosystem
9591
package: str
9692
version: str
9793
files: list[FileReport]
9894
verdict: Verdict
9995

10096
def to_dict(self) -> dict[str, Any]:
10197
return {
102-
"ecosystem": self.ecosystem,
10398
"package": self.package,
10499
"version": self.version,
105100
"verdict": self.verdict.to_dict(),

tests/test_llm_scan.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ def _make_file(self, root: Path, rel: str, content: bytes = b"x = 1\n") -> Packa
109109
path.parent.mkdir(parents=True, exist_ok=True)
110110
path.write_bytes(content)
111111
return PackageFile(
112-
ecosystem="npm",
113112
package="demo",
114113
version="1.0.0",
115114
package_root=root,
@@ -248,7 +247,6 @@ def test_node_modules_discovery_and_estimate(self):
248247
include_tests=False,
249248
no_cache=True,
250249
concurrency=1,
251-
max_files_per_pkg=200,
252250
chunk_tokens=4000,
253251
chunk_overlap_tokens=600,
254252
progress=False,
@@ -262,6 +260,23 @@ def test_node_modules_discovery_and_estimate(self):
262260
self.assertEqual(result.chunk_count, 2)
263261
self.assertIsNotNone(result.estimated_cost_usd)
264262

263+
def test_dir_mode_falls_back_for_unrecognized_folders(self):
264+
with tempfile.TemporaryDirectory() as tmp:
265+
root = Path(tmp) / "myproj"
266+
(root / "src").mkdir(parents=True)
267+
(root / ".git").mkdir()
268+
(root / ".git" / "config").write_text("[core]\n", encoding="utf-8")
269+
(root / "src" / "main.js").write_text("console.log('hi')\n", encoding="utf-8")
270+
(root / "src" / "lib.py").write_text("print('hi')\n", encoding="utf-8")
271+
(root / "README.md").write_text("# hi\n", encoding="utf-8")
272+
273+
files, errors = discover([str(root)])
274+
self.assertFalse(errors)
275+
self.assertTrue(files)
276+
self.assertTrue(all(file.package == "myproj" for file in files))
277+
# .git is pure metadata: skipped at walk time, not at filter time.
278+
self.assertFalse(any(".git" in file.rel_path.parts for file in files))
279+
265280

266281
if __name__ == "__main__":
267282
unittest.main()

0 commit comments

Comments
 (0)