Skip to content

Commit 75524e9

Browse files
committed
Add --missing-file-content option to handle broken symlinks in validate
When running `dandi validate` on a datalad dataset without fetched data, broken symlinks cause verbose exception tracebacks for every file (issue #1606). This adds a --missing-file-content option with three policies: - error (default): emit a concise single-line error per broken symlink - skip: skip the file entirely with a WARNING - only-non-data: skip content-dependent validators (pynwb, nwbinspector) but still validate path layout The MissingFileContent enum is threaded from the CLI through the validate pipeline to individual DandiFile.get_validation_errors() implementations. Closes #1606 https://claude.ai/code/session_01CLi49c7QcJx11b7UfshbvE
1 parent 0af950f commit 75524e9

File tree

9 files changed

+362
-64
lines changed

9 files changed

+362
-64
lines changed

dandi/cli/cmd_validate.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
validation_companion_path,
2222
write_validation_jsonl,
2323
)
24-
from ..validate._types import Severity, ValidationResult
24+
from ..validate._types import MissingFileContent, Severity, ValidationResult
2525

2626
lgr = logging.getLogger(__name__)
2727

@@ -56,6 +56,7 @@ def _collect_results(
5656
schema: str | None,
5757
devel_debug: bool,
5858
allow_any_path: bool,
59+
missing_file_content: MissingFileContent = MissingFileContent.error,
5960
) -> list[ValidationResult]:
6061
"""Run validation and collect all results into a list."""
6162
# Avoid heavy import by importing within function:
@@ -81,6 +82,7 @@ def _collect_results(
8182
schema_version=schema,
8283
devel_debug=devel_debug,
8384
allow_any_path=allow_any_path,
85+
missing_file_content=missing_file_content,
8486
)
8587
)
8688

@@ -208,6 +210,17 @@ def validate_bids(
208210
help="Limit results per group (or total if ungrouped). "
209211
"Excess results are replaced by a count of omitted items.",
210212
)
213+
@click.option(
214+
"--missing-file-content",
215+
"missing_file_content",
216+
help="How to handle files whose content is unavailable (e.g. broken symlinks "
217+
"in a datalad dataset without fetched data). 'error' (default) emits a "
218+
"concise error per file, 'skip' skips each such file with a warning, "
219+
"'only-non-data' skips content-dependent validators but still validates "
220+
"path layout.",
221+
type=click.Choice(["error", "only-non-data", "skip"], case_sensitive=True),
222+
default="error",
223+
)
211224
@click.option(
212225
"--load",
213226
help="Load validation results from JSONL file(s) instead of running validation.",
@@ -229,6 +242,7 @@ def validate(
229242
output_file: str | None = None,
230243
summary: bool = False,
231244
max_per_group: int | None = None,
245+
missing_file_content: str = "error",
232246
load: tuple[str, ...] = (),
233247
schema: str | None = None,
234248
devel_debug: bool = False,
@@ -270,7 +284,10 @@ def validate(
270284
if load:
271285
results = load_validation_jsonl(load)
272286
else:
273-
results = _collect_results(paths, schema, devel_debug, allow_any_path)
287+
mfc = MissingFileContent(missing_file_content)
288+
results = _collect_results(
289+
paths, schema, devel_debug, allow_any_path, missing_file_content=mfc
290+
)
274291
# Auto-save companion right after collection, before filtering — so
275292
# all results are preserved regardless of display filters.
276293
# Skip when writing to --output (user already gets structured output).

dandi/cli/tests/test_cmd_validate.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,3 +896,73 @@ def test_validate_auto_companion_structured_stdout(
896896
assert r.exit_code == 1
897897

898898
assert len(list(redirected_logdir.glob("*_validation.jsonl"))) == 1
899+
900+
901+
# ---- Tests for --missing-file-content option (issue #1606) ----
902+
903+
904+
def _make_dandiset_with_broken_symlinks(tmp_path: Path) -> Path:
905+
"""Create a minimal dandiset with broken NWB symlinks (simulating datalad)."""
906+
from ...consts import dandiset_metadata_file
907+
908+
(tmp_path / dandiset_metadata_file).write_text(
909+
"identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
910+
)
911+
for name in ("sub-001/sub-001.nwb", "sub-002/sub-002.nwb"):
912+
p = tmp_path / name
913+
p.parent.mkdir(parents=True, exist_ok=True)
914+
p.symlink_to("/nonexistent/annex/target.nwb")
915+
return tmp_path
916+
917+
918+
@pytest.mark.ai_generated
919+
def test_validate_missing_file_content_error_default(tmp_path: Path) -> None:
920+
"""Default --missing-file-content=error emits concise errors for broken symlinks."""
921+
ds = _make_dandiset_with_broken_symlinks(tmp_path)
922+
r = CliRunner().invoke(validate, [str(ds)])
923+
assert r.exit_code == 1
924+
assert "FILE_CONTENT_MISSING" in r.output
925+
assert "broken symlink" in r.output.lower()
926+
# No Python tracebacks in output
927+
assert "Traceback" not in r.output
928+
929+
930+
@pytest.mark.ai_generated
931+
def test_validate_missing_file_content_skip(tmp_path: Path) -> None:
932+
"""--missing-file-content=skip skips broken symlinks with a WARNING."""
933+
ds = _make_dandiset_with_broken_symlinks(tmp_path)
934+
r = CliRunner().invoke(validate, ["--missing-file-content", "skip", str(ds)])
935+
# Only warnings, no errors -> exit 0
936+
assert r.exit_code == 0
937+
assert "FILE_CONTENT_MISSING_SKIPPED" in r.output
938+
assert "skipped" in r.output.lower()
939+
940+
941+
@pytest.mark.ai_generated
942+
def test_validate_missing_file_content_only_non_data(tmp_path: Path) -> None:
943+
"""--missing-file-content=only-non-data validates path layout only."""
944+
ds = _make_dandiset_with_broken_symlinks(tmp_path)
945+
r = CliRunner().invoke(
946+
validate, ["--missing-file-content", "only-non-data", str(ds)]
947+
)
948+
assert "FILE_CONTENT_MISSING_PARTIAL" in r.output
949+
# No pynwb tracebacks
950+
assert "Traceback" not in r.output
951+
952+
953+
@pytest.mark.ai_generated
954+
def test_validate_missing_file_content_no_broken_symlinks(tmp_path: Path) -> None:
955+
"""--missing-file-content has no effect on normal files (no broken symlinks)."""
956+
from ...consts import dandiset_metadata_file
957+
958+
(tmp_path / dandiset_metadata_file).write_text(
959+
"identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
960+
)
961+
r_default = CliRunner().invoke(validate, [str(tmp_path)])
962+
r_skip = CliRunner().invoke(
963+
validate, ["--missing-file-content", "skip", str(tmp_path)]
964+
)
965+
# Both should succeed identically (no broken symlinks)
966+
assert r_default.exit_code == r_skip.exit_code == 0
967+
assert "FILE_CONTENT_MISSING" not in r_default.output
968+
assert "FILE_CONTENT_MISSING" not in r_skip.output

dandi/files/bases.py

Lines changed: 95 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
from dandi.validate._types import (
3333
ORIGIN_INTERNAL_DANDI,
3434
ORIGIN_VALIDATION_DANDI,
35+
ORIGIN_VALIDATION_DANDI_LAYOUT,
36+
MissingFileContent,
3537
Origin,
3638
OriginType,
3739
Scope,
@@ -91,9 +93,18 @@ def get_validation_errors(
9193
self,
9294
schema_version: str | None = None,
9395
devel_debug: bool = False,
96+
missing_file_content: MissingFileContent | None = None,
9497
) -> list[ValidationResult]:
9598
"""
9699
Attempt to validate the file and return a list of errors encountered
100+
101+
Parameters
102+
----------
103+
missing_file_content : MissingFileContent | None
104+
When set (not None), the file's content is known to be unavailable
105+
(e.g. broken symlink). Validators should adjust behaviour
106+
accordingly — for ``only-non-data`` they should skip
107+
content-dependent checks and emit a WARNING.
97108
"""
98109
...
99110

@@ -116,6 +127,7 @@ def get_validation_errors(
116127
self,
117128
schema_version: str | None = None,
118129
devel_debug: bool = False,
130+
missing_file_content: MissingFileContent | None = None,
119131
) -> list[ValidationResult]:
120132
with open(self.filepath) as f:
121133
meta = yaml_load(f, typ="safe")
@@ -185,7 +197,14 @@ def get_validation_errors(
185197
self,
186198
schema_version: str | None = None,
187199
devel_debug: bool = False,
200+
missing_file_content: MissingFileContent | None = None,
188201
) -> list[ValidationResult]:
202+
# When file content is unavailable and policy is only-non-data,
203+
# skip metadata extraction (which requires reading the file) and
204+
# only do path-based validation via subclass hooks.
205+
if missing_file_content == MissingFileContent.only_non_data:
206+
return []
207+
189208
current_version = get_schema_version()
190209
if schema_version is None:
191210
schema_version = current_version
@@ -511,81 +530,101 @@ def get_validation_errors(
511530
self,
512531
schema_version: str | None = None,
513532
devel_debug: bool = False,
533+
missing_file_content: MissingFileContent | None = None,
514534
) -> list[ValidationResult]:
515535
"""
516536
Validate NWB asset
517537
518538
If ``schema_version`` was provided, we only validate basic metadata,
519-
and completely skip validation using nwbinspector.inspect_nwbfile
520-
"""
521-
# Avoid heavy import by importing within function:
522-
from nwbinspector import Importance, inspect_nwbfile, load_config
539+
and completely skip validation using nwbinspector.inspect_nwbfile.
523540
524-
# Avoid heavy import by importing within function:
525-
from dandi.pynwb_utils import validate as pynwb_validate
541+
If ``missing_file_content`` is ``only-non-data``, content-dependent
542+
validators (pynwb, nwbinspector) are skipped and only path-layout
543+
validation is performed.
544+
"""
545+
errors: list[ValidationResult] = []
526546

527-
errors: list[ValidationResult] = pynwb_validate(
528-
self.filepath, devel_debug=devel_debug
529-
)
530-
if schema_version is not None:
531-
errors.extend(
532-
super().get_validation_errors(
533-
schema_version=schema_version, devel_debug=devel_debug
547+
if missing_file_content == MissingFileContent.only_non_data:
548+
# Skip content-dependent validators; emit a warning
549+
errors.append(
550+
ValidationResult(
551+
id="DANDI.FILE_CONTENT_MISSING_PARTIAL",
552+
origin=ORIGIN_VALIDATION_DANDI_LAYOUT,
553+
severity=Severity.WARNING,
554+
scope=Scope.FILE,
555+
path=self.filepath,
556+
dandiset_path=self.dandiset_path,
557+
message=(
558+
"File content is not available; "
559+
"skipping content-dependent validators "
560+
"(pynwb, nwbinspector). Only path layout is validated."
561+
),
534562
)
535563
)
536564
else:
537-
# make sure that we have some basic metadata fields we require
538-
try:
539-
origin_validation_nwbinspector = Origin(
540-
type=OriginType.VALIDATION,
541-
validator=Validator.nwbinspector,
542-
validator_version=str(_get_nwb_inspector_version()),
565+
# Avoid heavy import by importing within function:
566+
from nwbinspector import Importance, inspect_nwbfile, load_config
567+
568+
# Avoid heavy import by importing within function:
569+
from dandi.pynwb_utils import validate as pynwb_validate
570+
571+
errors.extend(pynwb_validate(self.filepath, devel_debug=devel_debug))
572+
if schema_version is not None:
573+
errors.extend(
574+
super().get_validation_errors(
575+
schema_version=schema_version, devel_debug=devel_debug
576+
)
543577
)
578+
else:
579+
# make sure that we have some basic metadata fields we require
580+
try:
581+
origin_validation_nwbinspector = Origin(
582+
type=OriginType.VALIDATION,
583+
validator=Validator.nwbinspector,
584+
validator_version=str(_get_nwb_inspector_version()),
585+
)
544586

545-
for error in inspect_nwbfile(
546-
nwbfile_path=self.filepath,
547-
skip_validate=True,
548-
config=load_config(filepath_or_keyword="dandi"),
549-
importance_threshold=Importance.BEST_PRACTICE_VIOLATION,
550-
# we might want to switch to a lower threshold once nwbinspector
551-
# upstream reporting issues are clarified:
552-
# https://github.com/dandi/dandi-cli/pull/1162#issuecomment-1322238896
553-
# importance_threshold=Importance.BEST_PRACTICE_SUGGESTION,
554-
):
555-
severity = NWBI_IMPORTANCE_TO_DANDI_SEVERITY[error.importance.name]
556-
kw: Any = {}
557-
if error.location:
558-
kw["within_asset_paths"] = {
559-
error.file_path: error.location,
560-
}
561-
errors.append(
562-
ValidationResult(
563-
origin=origin_validation_nwbinspector,
564-
severity=severity,
565-
id=f"NWBI.{error.check_function_name}",
566-
scope=Scope.FILE,
567-
origin_result=error,
568-
path=Path(error.file_path),
569-
message=error.message,
570-
# Assuming multiple sessions per multiple subjects,
571-
# otherwise nesting level might differ
572-
dataset_path=Path(error.file_path).parent.parent, # TODO
573-
dandiset_path=Path(error.file_path).parent, # TODO
574-
**kw,
587+
for error in inspect_nwbfile(
588+
nwbfile_path=self.filepath,
589+
skip_validate=True,
590+
config=load_config(filepath_or_keyword="dandi"),
591+
importance_threshold=Importance.BEST_PRACTICE_VIOLATION,
592+
):
593+
severity = NWBI_IMPORTANCE_TO_DANDI_SEVERITY[
594+
error.importance.name
595+
]
596+
kw: Any = {}
597+
if error.location:
598+
kw["within_asset_paths"] = {
599+
error.file_path: error.location,
600+
}
601+
errors.append(
602+
ValidationResult(
603+
origin=origin_validation_nwbinspector,
604+
severity=severity,
605+
id=f"NWBI.{error.check_function_name}",
606+
scope=Scope.FILE,
607+
origin_result=error,
608+
path=Path(error.file_path),
609+
message=error.message,
610+
dataset_path=Path(error.file_path).parent.parent,
611+
dandiset_path=Path(error.file_path).parent,
612+
**kw,
613+
)
575614
)
615+
except Exception as e:
616+
if devel_debug:
617+
raise
618+
# TODO: might reraise instead of making it into an error
619+
return _pydantic_errors_to_validation_results(
620+
[e], self.filepath, scope=Scope.FILE
576621
)
577-
except Exception as e:
578-
if devel_debug:
579-
raise
580-
# TODO: might reraise instead of making it into an error
581-
return _pydantic_errors_to_validation_results(
582-
[e], self.filepath, scope=Scope.FILE
583-
)
584622

585623
# Avoid circular imports by importing within function:
586624
from .bids import NWBBIDSAsset
587625
from ..organize import validate_organized_path
588626

627+
# Path-layout validation always runs (doesn't need content)
589628
if not isinstance(self, NWBBIDSAsset) and self.dandiset_path is not None:
590629
errors.extend(
591630
validate_organized_path(self.path, self.filepath, self.dandiset_path)

0 commit comments

Comments
 (0)