Skip to content

Commit d729c95

Browse files
committed
AP-676: do not fetch files of an unsupported mime type
1 parent 9120273 commit d729c95

2 files changed

Lines changed: 10 additions & 3 deletions

File tree

mokelumne/dags/fetch_images.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919

2020
RunStatus = namedtuple('RunStatus', ('tind_id', 'status', 'path'))
2121

22+
SUPPORTED_IMAGE_TYPES = {"image/jpeg", "image/png", "image/gif", "image/webp"}
2223

2324
@dag(schedule=[to_process_csv], catchup=False, tags=["tind", "fetch", "batch-image"])
2425
def fetch_images():
@@ -58,11 +59,17 @@ def fetch_image_to_record_directory(orig_run_id: str, tind_id: str) -> RunStatus
5859
"""Fetch an image from TIND to the target record's storage directory."""
5960
try:
6061
client = FetchTind(orig_run_id)
61-
path = client.download_image_file(tind_id)
62+
filemd = client.client.fetch_file_metadata(tind_id)
63+
if filemd[0].get("mime") in SUPPORTED_IMAGE_TYPES:
64+
path = client.download_image_file(tind_id)
65+
status = "fetched"
66+
else:
67+
path = ""
68+
status = f"skipped: Unsupported file type {filemd[0].get('mime')}"
6269
except Exception as ex: # pylint: disable=broad-exception-caught
6370
return RunStatus(tind_id=tind_id, status=f'failed: {str(ex)}', path='')
6471

65-
return RunStatus(tind_id=tind_id, status='fetched', path=path)
72+
return RunStatus(tind_id=tind_id, status=status, path=path)
6673

6774
@task(outlets=[fetched_csv])
6875
def write_status_to_fetched_csv(orig_run_id: str, records: dict[str, list[str]],

mokelumne/dags/generate_image_descriptions.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def read_and_batch_csv() -> list[list[dict[str, str]]]:
6161

6262
with open(fetched_csv_path, mode="r", encoding="utf-8") as f:
6363
reader = csv.DictReader(f)
64-
rows = list(reader)
64+
rows = list(filter(lambda x: x["Status"] == "fetched", reader))
6565

6666
# we could make the chunking a parameter or an env variable
6767
return [rows[i : i + 10] for i in range(0, len(rows), 10)]

0 commit comments

Comments
 (0)