Skip to content

Commit 73abe0d

Browse files
authored
Merge pull request #282 from scipp/pooch-untar
Support untar and multiple files in data registry
2 parents fbe5d45 + fd4cb8d commit 73abe0d

File tree

1 file changed

+110
-44
lines changed

1 file changed

+110
-44
lines changed

src/ess/reduce/data/_registry.py

Lines changed: 110 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33

44
from __future__ import annotations
55

6+
import dataclasses
67
import hashlib
78
import os
89
from abc import ABC, abstractmethod
910
from collections.abc import Mapping
10-
from dataclasses import dataclass
1111
from functools import cache
1212
from pathlib import Path
13-
from typing import Any
13+
from typing import Any, Literal
1414

1515
_LOCAL_CACHE_ENV_VAR = "SCIPP_DATA_DIR"
1616
_LOCAL_REGISTRY_ENV_VAR = "SCIPP_OVERRIDE_DATA_DIR"
@@ -28,8 +28,8 @@ def make_registry(
2828
2929
By default, this function creates a :class:`PoochRegistry` to download files
3030
via HTTP from an online file store.
31-
This can be overridden by setting the environment variable `SCIPP_DATA_DIR` to a
32-
path on the local file system.
31+
This can be overridden by setting the environment variable
32+
``SCIPP_OVERRIDE_DATA_DIR`` to a path on the local file system.
3333
In this case, a :class:`LocalRegistry` is returned.
3434
3535
Files are specified as a dict using either the Pooch string format explicitly
@@ -40,7 +40,11 @@ def make_registry(
4040
... "file1.dat": "md5:1234567890abcdef",
4141
... "file2.csv": Entry(alg="md5", chk="abcdef123456789"),
4242
... "folder/nested.dat": "blake2b:1234567890abcdef",
43-
... "zipped.zip": Entry(alg="blake2b", chk="abcdef123456789", unzip=True),
43+
... "zipped.zip": Entry(
44+
... alg="blake2b",
45+
... chk="abcdef123456789",
46+
... extractor="unzip"
47+
... ),
4448
... }
4549
4650
In the example above, the specifications for ``file1.dat`` and ``file2.csv`` are
@@ -49,10 +53,11 @@ def make_registry(
4953
Paths like this must always use forward slashes (/) even on Windows.
5054
5155
As shown above, it is possible to automatically unzip
52-
files by specifying ``unzip=True``.
56+
files by specifying ``extractor="unzip"``.
5357
When calling ``registry.get_path("zipped.zip")`` the file will be unzipped and
5458
a path to the content is returned.
55-
This expects that there is only a single file in the zip archive.
59+
Similarly, ``extractor="untar"`` specifies that a file needs to be untarred
60+
(and possibly un-gzipped).
5661
5762
The complete path to the source file is constructed as follows:
5863
@@ -111,17 +116,26 @@ def _check_local_override_path(override: str) -> Path:
111116
return path
112117

113118

114-
@dataclass(frozen=True, slots=True)
119+
@dataclasses.dataclass(frozen=True, slots=True)
115120
class Entry:
116121
"""An entry in a registry."""
117122

118123
chk: str
119124
"""Checksum."""
120125
alg: str
121126
"""Checksum algorithm."""
122-
unzip: bool = False
127+
extractor: Literal["unzip", "untar"] | None = None
128+
"""Processor to extract file contents."""
129+
130+
unzip: dataclasses.InitVar[bool] = False
123131
"""Whether to unzip the file."""
124132

133+
def __post_init__(self, unzip: bool) -> None:
134+
if self.extractor is not None and unzip:
135+
raise TypeError("Set either the 'unzip' argument or 'extractor', not both.")
136+
if self.extractor is None and unzip:
137+
super().__setattr__("extractor", "unzip")
138+
125139
@classmethod
126140
def from_pooch_string(cls, pooch_string: str) -> Entry:
127141
alg, chk = pooch_string.split(":")
@@ -132,7 +146,7 @@ class Registry(ABC):
132146
def __init__(self, files: Mapping[str, str | Entry]) -> None:
133147
self._files = _to_file_entries(files)
134148

135-
@abstractmethod
149+
@cache # noqa: B019
136150
def get_path(self, name: str) -> Path:
137151
"""Get the path to a file in the registry.
138152
@@ -154,9 +168,60 @@ def get_path(self, name: str) -> Path:
154168
:
155169
The Path to the file.
156170
"""
171+
return Path(
172+
_expect_single(
173+
self._fetch(name, extractor=self._extractor_processor(name)),
174+
name,
175+
)
176+
)
157177

158-
def _needs_unzip(self, name: str) -> bool:
159-
return self._files[name].unzip
178+
@cache # noqa: B019
179+
def get_paths(self, name: str) -> list[Path]:
180+
"""Get the paths to unpacked files from the registry.
181+
182+
This method downloads the given file, extracts its contents, and returns
183+
the paths to all extracted contents.
184+
Unlike :meth:`get_path`, this method requires an extractor processor
185+
(unzip or untar).
186+
187+
Depending on the implementation, the file is downloaded if necessary.
188+
189+
Note that implementations are allowed to cache return values of this method
190+
to avoid recomputing potentially expensive checksums.
191+
This usually means that the ``Registry`` object itself gets stored until the
192+
Python interpreter shuts down.
193+
However, registries are small and do not own resources.
194+
195+
Parameters
196+
----------
197+
name:
198+
Name of the zipped or tarred file to get the paths for.
199+
200+
Returns
201+
-------
202+
:
203+
The Paths to the files.
204+
"""
205+
if (extractor := self._extractor_processor(name)) is None:
206+
raise ValueError(f"File '{name}' is not zipped or tarred.")
207+
return [Path(path) for path in self._fetch(name, extractor=extractor)]
208+
209+
def _extractor_processor_type(self, name: str) -> Any:
210+
match self._files[name].extractor:
211+
case "unzip":
212+
return _pooch_unzip_processor_class()
213+
case "untar":
214+
return _pooch_untar_processor_class()
215+
case None:
216+
return None
217+
218+
@abstractmethod
219+
def _extractor_processor(self, name: str) -> Any:
220+
"""Return an instance of a processor for the given file."""
221+
222+
@abstractmethod
223+
def _fetch(self, name: str, extractor: Any) -> list[str] | str:
224+
"""Fetch the given file from the registry."""
160225

161226

162227
class PoochRegistry(Registry):
@@ -178,24 +243,15 @@ def __init__(
178243
)
179244
super().__init__(files)
180245

181-
@cache # noqa: B019
182-
def get_path(self, name: str) -> Path:
183-
"""Get the path to a file in the registry.
184-
185-
Downloads the file if necessary.
186-
"""
187-
if self._needs_unzip(name):
188-
paths: list[str] = self._registry.fetch( # type: ignore[assignment]
189-
name, processor=self._unzip_processor
190-
)
191-
return Path(_expect_single_unzipped(paths, name))
192-
return Path(self._registry.fetch(name))
246+
def _fetch(self, name: str, extractor: Any) -> list[str] | str:
247+
return self._registry.fetch(name, processor=extractor)
193248

194-
@property
195-
def _unzip_processor(self) -> Any:
249+
def _extractor_processor(self, name: str) -> Any:
196250
# Create a new processor on demand because reusing the same processor would
197251
# reuse the same output path for every file.
198-
return _import_pooch().Unzip()
252+
if (cls := self._extractor_processor_type(name=name)) is not None:
253+
return cls()
254+
return None
199255

200256

201257
class LocalRegistry(Registry):
@@ -217,12 +273,11 @@ def __init__(
217273
base_url=base_url,
218274
retry_if_failed=retry_if_failed,
219275
)
220-
self._extract_dir = pooch_registry.path
276+
self._extract_base_dir = pooch_registry.path
221277
self._source_path = source_path.resolve().joinpath(*prefix.split("/"), version)
222278
super().__init__(files)
223279

224-
@cache # noqa: B019
225-
def get_path(self, name: str) -> Path:
280+
def _fetch(self, name: str, extractor: Any) -> list[str] | str:
226281
"""Get the path to a file in the registry."""
227282
try:
228283
entry = self._files[name]
@@ -238,24 +293,24 @@ def get_path(self, name: str) -> Path:
238293

239294
_check_hash(name, path, entry)
240295

241-
if self._needs_unzip(name):
242-
return Path(
243-
_expect_single_unzipped(
244-
self._unzip_processor(os.fspath(path), "download", None), path
245-
)
246-
)
247-
return path
296+
if extractor is not None:
297+
return extractor(os.fspath(path), "download", None)
298+
return os.fspath(path)
248299

249300
def _local_path(self, name: str) -> Path:
250301
# Split on "/" because `name` is always a POSIX-style path, but the return
251302
# value is a system path, i.e., it can be a Windows-style path.
252303
return self._source_path.joinpath(*name.split("/"))
253304

254-
@property
255-
def _unzip_processor(self) -> Any:
305+
def _extract_dir(self, name: str) -> Path:
306+
return self._extract_base_dir / name
307+
308+
def _extractor_processor(self, name: str) -> Any:
256309
# Create a new processor on demand because reusing the same processor would
257310
# reuse the same output path for every file.
258-
return _import_pooch().Unzip(self._extract_dir)
311+
if (cls := self._extractor_processor_type(name=name)) is not None:
312+
return cls(extract_dir=self._extract_dir(name))
313+
return None
259314

260315

261316
def _import_pooch() -> Any:
@@ -288,19 +343,30 @@ def _create_pooch(
288343
)
289344

290345

291-
def _pooch_unzip_processor(extract_dir: Path) -> Any:
346+
def _pooch_unzip_processor_class() -> Any:
292347
try:
293348
import pooch
294349
except ImportError:
295350
raise ImportError("You need to install Pooch to unzip files.") from None
296351

297-
return pooch.processors.Unzip(extract_dir=os.fspath(extract_dir))
352+
return pooch.processors.Unzip
353+
354+
355+
def _pooch_untar_processor_class() -> Any:
356+
try:
357+
import pooch
358+
except ImportError:
359+
raise ImportError("You need to install Pooch to untar files.") from None
360+
361+
return pooch.processors.Untar
298362

299363

300-
def _expect_single_unzipped(paths: list[str], archive: str | os.PathLike) -> str:
364+
def _expect_single(paths: list[str] | str, archive: str | os.PathLike) -> str:
365+
if isinstance(paths, str):
366+
return paths
301367
if len(paths) != 1:
302368
raise ValueError(
303-
f"Expected exactly one file to unzip, got {len(paths)} in "
369+
f"Expected exactly one extracted file, got {len(paths)} in "
304370
f"'{os.fspath(archive)}'."
305371
)
306372
return paths[0]

0 commit comments

Comments
 (0)