|
1 | 1 | import asyncio |
2 | 2 | import dataclasses |
| 3 | +import hashlib |
3 | 4 | from collections import defaultdict |
4 | 5 | from collections.abc import Callable, Iterable |
5 | 6 | from functools import cached_property |
@@ -66,18 +67,31 @@ def files(self) -> list[File]: |
66 | 67 | keep_duplicates=False, |
67 | 68 | ) |
68 | 69 |
|
69 | | - def _get_local_path(self, file: File) -> Path: |
| 70 | + def get_local_path(self, file: File) -> Path: |
70 | 71 | return self._client.fs.paths.data / file.local_path / file.filename |
71 | 72 |
|
| 73 | + @property |
| 74 | + def missing_files(self) -> list[File]: |
| 75 | + missing_files = [] |
| 76 | + for file in self.files: |
| 77 | + path = self.get_local_path(file) |
| 78 | + if path.exists(): |
| 79 | + with path.open("rb") as f: |
| 80 | + digest = hashlib.file_digest(f, file.checksum_type) |
| 81 | + if digest.hexdigest() == file.checksum: |
| 82 | + continue |
| 83 | + missing_files.append(file) |
| 84 | + return missing_files |
| 85 | + |
72 | 86 | @cached_property |
73 | 87 | def local_paths(self) -> dict[str, list[Path]]: |
74 | 88 | datasets = defaultdict(list) |
75 | 89 | for file in self.files: |
76 | | - datasets[file.dataset_id].append(self._get_local_path(file)) |
| 90 | + datasets[file.dataset_id].append(self.get_local_path(file)) |
77 | 91 | return dict(datasets) |
78 | 92 |
|
79 | 93 | def download(self) -> None: |
80 | | - _, errors = asyncio.run(self._client.download(self.files, use_db=False)) |
| 94 | + _, errors = asyncio.run(self._client.download(self.missing_files, use_db=False)) |
81 | 95 | exceptions = [] |
82 | 96 | for error in errors: |
83 | 97 | err = error.err |
|
0 commit comments