33
44from __future__ import annotations
55
6+ import dataclasses
67import hashlib
78import os
89from abc import ABC , abstractmethod
910from collections .abc import Mapping
10- from dataclasses import dataclass
1111from functools import cache
1212from pathlib import Path
13- from typing import Any
13+ from typing import Any , Literal
1414
1515_LOCAL_CACHE_ENV_VAR = "SCIPP_DATA_DIR"
1616_LOCAL_REGISTRY_ENV_VAR = "SCIPP_OVERRIDE_DATA_DIR"
@@ -28,8 +28,8 @@ def make_registry(
2828
2929 By default, this function creates a :class:`PoochRegistry` to download files
3030 via HTTP from an online file store.
31- This can be overridden by setting the environment variable `SCIPP_DATA_DIR` to a
32- path on the local file system.
31+ This can be overridden by setting the environment variable
32+ ``SCIPP_OVERRIDE_DATA_DIR`` to a path on the local file system.
3333 In this case, a :class:`LocalRegistry` is returned.
3434
3535 Files are specified as a dict using either the Pooch string format explicitly
@@ -40,7 +40,11 @@ def make_registry(
4040 ... "file1.dat": "md5:1234567890abcdef",
4141 ... "file2.csv": Entry(alg="md5", chk="abcdef123456789"),
4242 ... "folder/nested.dat": "blake2b:1234567890abcdef",
43- ... "zipped.zip": Entry(alg="blake2b", chk="abcdef123456789", unzip=True),
43+ ... "zipped.zip": Entry(
44+ ... alg="blake2b",
45+ ... chk="abcdef123456789",
46+ ... extractor="unzip"
47+ ... ),
4448 ... }
4549
4650 In the example above, the specifications for ``file1.dat`` and ``file2.csv`` are
@@ -49,10 +53,11 @@ def make_registry(
4953 Paths like this must always use forward slashes (/) even on Windows.
5054
5155 As shown above, it is possible to automatically unzip
52- files by specifying ``unzip=True ``.
56+ files by specifying ``extractor="unzip" ``.
5357 When calling ``registry.get_path("zipped.zip")`` the file will be unzipped and
5458 a path to the content is returned.
55- This expects that there is only a single file in the zip archive.
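    Similarly, ``extractor="untar"`` specifies that a file needs to be untarred
    (and possibly un-gzipped).
    Note that ``registry.get_path`` expects the archive to contain exactly one
    file; use ``registry.get_paths`` to get the paths of all extracted files.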
5661
5762 The complete path to the source file is constructed as follows:
5863
@@ -111,17 +116,26 @@ def _check_local_override_path(override: str) -> Path:
111116 return path
112117
113118
114- @dataclass (frozen = True , slots = True )
119+ @dataclasses . dataclass (frozen = True , slots = True )
115120class Entry :
116121 """An entry in a registry."""
117122
118123 chk : str
119124 """Checksum."""
120125 alg : str
121126 """Checksum algorithm."""
122- unzip : bool = False
127+ extractor : Literal ["unzip" , "untar" ] | None = None
128+ """Processor to extract file contents."""
129+
130+ unzip : dataclasses .InitVar [bool ] = False
123131 """Whether to unzip the file."""
124132
133+ def __post_init__ (self , unzip : bool ) -> None :
134+ if self .extractor is not None and unzip :
135+ raise TypeError ("Set either the 'unzip' argument or 'extractor', not both." )
136+ if self .extractor is None and unzip :
137+ super ().__setattr__ ("extractor" , "unzip" )
138+
125139 @classmethod
126140 def from_pooch_string (cls , pooch_string : str ) -> Entry :
127141 alg , chk = pooch_string .split (":" )
@@ -132,7 +146,7 @@ class Registry(ABC):
132146 def __init__ (self , files : Mapping [str , str | Entry ]) -> None :
133147 self ._files = _to_file_entries (files )
134148
135- @abstractmethod
149+ @cache # noqa: B019
136150 def get_path (self , name : str ) -> Path :
137151 """Get the path to a file in the registry.
138152
@@ -154,9 +168,60 @@ def get_path(self, name: str) -> Path:
154168 :
155169 The Path to the file.
156170 """
171+ return Path (
172+ _expect_single (
173+ self ._fetch (name , extractor = self ._extractor_processor (name )),
174+ name ,
175+ )
176+ )
157177
158- def _needs_unzip (self , name : str ) -> bool :
159- return self ._files [name ].unzip
178+ @cache # noqa: B019
179+ def get_paths (self , name : str ) -> list [Path ]:
180+ """Get the paths to unpacked files from the registry.
181+
182+ This method downloads the given file, extracts its contents, and returns
183+ the paths to all extracted contents.
184+ Unlike :meth:`get_path`, this method requires an extractor processor
185+ (unzip or untar).
186+
187+ Depending on the implementation, the file is downloaded if necessary.
188+
189+ Note that implementations are allowed to cache return values of this method
190+ to avoid recomputing potentially expensive checksums.
191+ This usually means that the ``Registry`` object itself gets stored until the
192+ Python interpreter shuts down.
193+ However, registries are small and do not own resources.
194+
195+ Parameters
196+ ----------
197+ name:
198+ Name of the zipped or tarred file to get the path for.
199+
200+ Returns
201+ -------
202+ :
203+ The Paths to the files.
204+ """
205+ if (extractor := self ._extractor_processor (name )) is None :
206+ raise ValueError (f"File '{ name } ' is not zipped or tarred." )
207+ return [Path (path ) for path in self ._fetch (name , extractor = extractor )]
208+
209+ def _extractor_processor_type (self , name : str ) -> Any :
210+ match self ._files [name ].extractor :
211+ case "unzip" :
212+ return _pooch_unzip_processor_class ()
213+ case "untar" :
214+ return _pooch_untar_processor_class ()
215+ case None :
216+ return None
217+
    @abstractmethod
    def _extractor_processor(self, name: str) -> Any:
        """Return an instance of a processor for the given file.

        The returned object is passed as ``extractor`` to :meth:`_fetch`.
        Implementations return ``None`` when the file needs no extraction;
        :meth:`get_paths` rejects such files.

        Parameters
        ----------
        name:
            Name of a file in the registry.
        """
221+
    @abstractmethod
    def _fetch(self, name: str, extractor: Any) -> list[str] | str:
        """Fetch the given file from the registry.

        Parameters
        ----------
        name:
            Name of a file in the registry.
        extractor:
            Processor obtained from :meth:`_extractor_processor`, or ``None``.

        Returns
        -------
        :
            Either a single path or a list of paths, as strings.
            :meth:`get_path` accepts both forms; :meth:`get_paths` iterates
            over the result.
        """
160225
161226
162227class PoochRegistry (Registry ):
@@ -178,24 +243,15 @@ def __init__(
178243 )
179244 super ().__init__ (files )
180245
181- @cache # noqa: B019
182- def get_path (self , name : str ) -> Path :
183- """Get the path to a file in the registry.
184-
185- Downloads the file if necessary.
186- """
187- if self ._needs_unzip (name ):
188- paths : list [str ] = self ._registry .fetch ( # type: ignore[assignment]
189- name , processor = self ._unzip_processor
190- )
191- return Path (_expect_single_unzipped (paths , name ))
192- return Path (self ._registry .fetch (name ))
246+ def _fetch (self , name : str , extractor : Any ) -> list [str ] | str :
247+ return self ._registry .fetch (name , processor = extractor )
193248
194- @property
195- def _unzip_processor (self ) -> Any :
249+ def _extractor_processor (self , name : str ) -> Any :
196250 # Create a new processor on demand because reusing the same processor would
197251 # reuse the same output path for every file.
198- return _import_pooch ().Unzip ()
252+ if (cls := self ._extractor_processor_type (name = name )) is not None :
253+ return cls ()
254+ return None
199255
200256
201257class LocalRegistry (Registry ):
@@ -217,12 +273,11 @@ def __init__(
217273 base_url = base_url ,
218274 retry_if_failed = retry_if_failed ,
219275 )
220- self ._extract_dir = pooch_registry .path
276+ self ._extract_base_dir = pooch_registry .path
221277 self ._source_path = source_path .resolve ().joinpath (* prefix .split ("/" ), version )
222278 super ().__init__ (files )
223279
224- @cache # noqa: B019
225- def get_path (self , name : str ) -> Path :
280+ def _fetch (self , name : str , extractor : Any ) -> list [str ] | str :
226281 """Get the path to a file in the registry."""
227282 try :
228283 entry = self ._files [name ]
@@ -238,24 +293,24 @@ def get_path(self, name: str) -> Path:
238293
239294 _check_hash (name , path , entry )
240295
241- if self ._needs_unzip (name ):
242- return Path (
243- _expect_single_unzipped (
244- self ._unzip_processor (os .fspath (path ), "download" , None ), path
245- )
246- )
247- return path
296+ if extractor is not None :
297+ return extractor (os .fspath (path ), "download" , None )
298+ return os .fspath (path )
248299
249300 def _local_path (self , name : str ) -> Path :
250301 # Split on "/" because `name` is always a POSIX-style path, but the return
251302 # value is a system path, i.e., it can be a Windows-style path.
252303 return self ._source_path .joinpath (* name .split ("/" ))
253304
254- @property
255- def _unzip_processor (self ) -> Any :
305+ def _extract_dir (self , name : str ) -> Path :
306+ return self ._extract_base_dir / name
307+
308+ def _extractor_processor (self , name : str ) -> Any :
256309 # Create a new processor on demand because reusing the same processor would
257310 # reuse the same output path for every file.
258- return _import_pooch ().Unzip (self ._extract_dir )
311+ if (cls := self ._extractor_processor_type (name = name )) is not None :
312+ return cls (extract_dir = self ._extract_dir (name ))
313+ return None
259314
260315
261316def _import_pooch () -> Any :
@@ -288,19 +343,30 @@ def _create_pooch(
288343 )
289344
290345
291- def _pooch_unzip_processor ( extract_dir : Path ) -> Any :
346+ def _pooch_unzip_processor_class ( ) -> Any :
292347 try :
293348 import pooch
294349 except ImportError :
295350 raise ImportError ("You need to install Pooch to unzip files." ) from None
296351
297- return pooch .processors .Unzip (extract_dir = os .fspath (extract_dir ))
352+ return pooch .processors .Unzip
353+
354+
355+ def _pooch_untar_processor_class () -> Any :
356+ try :
357+ import pooch
358+ except ImportError :
359+ raise ImportError ("You need to install Pooch to untar files." ) from None
360+
361+ return pooch .processors .Untar
298362
299363
300- def _expect_single_unzipped (paths : list [str ], archive : str | os .PathLike ) -> str :
364+ def _expect_single (paths : list [str ] | str , archive : str | os .PathLike ) -> str :
365+ if isinstance (paths , str ):
366+ return paths
301367 if len (paths ) != 1 :
302368 raise ValueError (
303- f"Expected exactly one file to unzip , got { len (paths )} in "
369+ f"Expected exactly one extracted file , got { len (paths )} in "
304370 f"'{ os .fspath (archive )} '."
305371 )
306372 return paths [0 ]
0 commit comments