Skip to content

Commit 7fa27f1

Browse files
committed
dags: Resize images that are too large for Bedrock
* Add `piffle` dep for IIIF metadata processing. * Add variable `TIND_IIIF_MANIFEST_URL_PATTERN` to define where the IIIF manifests live for the given TIND API. * Add helper code to the FetchTind class to fetch the image. This will likely be refactored to the TIND Client package later. * Add logic for determining the correct size. Implements: AP-672
1 parent e3604c4 commit 7fa27f1

6 files changed

Lines changed: 134 additions & 11 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ Important environment variables for our build/environment:
9393
| `OIDC_USER_GROUP` | Similar to admin group. This group is for users in both admin and user roles.| `OIDC_USER_GROUP="cn=edu:berkeley:org:libr:mokelumne:users,ou=campus groups,dc=berkeley,dc=edu"` |
9494
| `TIND_API_KEY` | API key for TIND access | `TIND_API_KEY="..."` |
9595
| `TIND_API_URL` | URL for TIND access | `TIND_API_URL="https://digicoll.lib.berkeley.edu/api/v1"` |
96+
| `TIND_IIIF_MANIFEST_URL_PATTERN` | URL pattern for TIND IIIF manifests | `TIND_IIIF_MANIFEST_URL_PATTERN="https://digicoll.lib.berkeley.edu/record/{tind_id}/export/iiif_manifest"` |
9697
| `MOKELUMNE_TIND_DOWNLOAD_DIR` | Path for downloaded image cache | `MOKELUMNE_TIND_DOWNLOAD_DIR="/some/path/to/download/to"` |
9798
|`LANGFUSE_HOST`|Host for Langfuse|`LANGFUSE_HOST="https://us.cloud.langfuse.com"`|
9899
|`LANGFUSE_SECRET_KEY`|sets langfuse secret key|`LANGFUSE_SECRET_KEY="sk-lf-blah-blah-blah"`|

example.env

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ OIDC_WELL_KNOWN="http://keycloak:8180/realms/berkeley-local/.well-known/openid-c
2222
# TBD
2323
TIND_API_KEY=
2424
TIND_API_URL=
25+
TIND_IIIF_MANIFEST_URL_PATTERN=https://digicoll.lib.berkeley.edu/record/{tind_id}/export/iiif_manifest
2526

2627
LANGFUSE_HOST=https://us.cloud.langfuse.com
2728
LANGFUSE_SECRET_KEY=sk-lf-blah-blah-blah

mokelumne/dags/fetch_images.py

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44

55
import csv
66
import logging
7+
import os
78

89
from collections import namedtuple
10+
from math import ceil, trunc
911
from pathlib import Path
1012
from typing import List
1113

@@ -20,6 +22,22 @@
2022
RunStatus = namedtuple('RunStatus', ('tind_id', 'status', 'path'))
2123

2224
SUPPORTED_IMAGE_TYPES = {"image/jpeg", "image/png", "image/gif", "image/webp"}
25+
"""The supported image MIME types we will fetch."""
26+
27+
SIZE_LIMIT: int = 3750000
28+
"""The upper bound for a size of image in bytes that we will fetch."""
29+
30+
31+
def base64_size(s: int | float) -> int:
32+
"""Determine the base64-encoded size of an object given its original size.
33+
34+
:param s: The original size of the object.
35+
:returns: The base64-encoded size of the object.
36+
:note: The result of this function has the same unit as its parameter.
37+
For example, passing 4.5 MiB will result in 8 (MiB).
38+
"""
39+
return int(4 * (ceil(s * 4 / 3) / 4))
40+
2341

2442
@dag(schedule=[to_process_csv], catchup=False, tags=["tind", "fetch", "batch-image"])
2543
def fetch_images():
@@ -59,17 +77,57 @@ def fetch_image_to_record_directory(orig_run_id: str, tind_id: str) -> RunStatus
5977
"""Fetch an image from TIND to the target record's storage directory."""
6078
try:
6179
client = FetchTind(orig_run_id)
62-
filemd = client.client.fetch_file_metadata(tind_id)
63-
if filemd[0].get("mime") in SUPPORTED_IMAGE_TYPES:
64-
path = client.download_image_file(tind_id)
65-
status = "fetched"
80+
filemd = client.get_first_file_metadata(tind_id)
81+
if not filemd.get("mime") in SUPPORTED_IMAGE_TYPES:
82+
return RunStatus(tind_id=tind_id, path="",
83+
status=f"skipped: Unsupported file type {filemd.get('mime')}")
84+
85+
target_width = width = filemd.get("width", 0)
86+
target_height = height = filemd.get("height", 0)
87+
size = filemd.get("size", 0)
88+
b64_size = base64_size(size)
89+
logger.warning("b64_size = %d, size = %d", b64_size, size)
90+
if b64_size > SIZE_LIMIT:
91+
factor = float(SIZE_LIMIT) / b64_size
92+
target_width *= factor
93+
target_height *= factor
94+
95+
if target_width > 8000:
96+
factor = 8000.0 / width
97+
target_width *= factor
98+
target_height *= factor
99+
100+
if target_height > 8000:
101+
factor = 8000.0 / height
102+
target_width *= factor
103+
target_height *= factor
104+
105+
target_width = int(trunc(target_width))
106+
target_height = int(trunc(target_height))
107+
108+
if (width != target_width) or (height != target_height):
109+
# downsample using IIIF
110+
path = client.download_image_from_record_sized(tind_id, target_width, target_height)
111+
112+
# TIND resampling may cause the image to be larger than original. recalculate.
113+
# in my testing, 290827 resampled to 5998x8000 went from 2.5 MB to 4.9 MB(!)
114+
new_size = os.stat(path).st_size
115+
b64_size = base64_size(new_size)
116+
if b64_size > SIZE_LIMIT:
117+
factor = float(SIZE_LIMIT) / b64_size
118+
target_width *= factor
119+
target_height *= factor
120+
target_width = int(trunc(target_width))
121+
target_height = int(trunc(target_height))
122+
# Only re-download if it actually does exceed the limit.
123+
path = client.download_image_from_record_sized(tind_id, target_width, target_height)
66124
else:
67-
path = ""
68-
status = f"skipped: Unsupported file type {filemd[0].get('mime')}"
125+
path = client.download_image_file(tind_id)
69126
except Exception as ex: # pylint: disable=broad-exception-caught
127+
logger.warning("Fetcher encountered exception", exc_info=ex)
70128
return RunStatus(tind_id=tind_id, status=f'failed: {str(ex)}', path='')
71129

72-
return RunStatus(tind_id=tind_id, status=status, path=path)
130+
return RunStatus(tind_id=tind_id, status="fetched", path=path)
73131

74132
@task(outlets=[fetched_csv])
75133
def write_status_to_fetched_csv(orig_run_id: str, records: dict[str, list[str]],

mokelumne/util/fetch_tind.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,21 @@
11
"""Provides a helper class, FetchTind, to download information from TIND."""
22

3-
import csv
3+
import logging
4+
from os import environ as ENV
5+
from typing import Any
46

5-
from tind_client import TINDClient
7+
import requests
8+
from piffle.image import IIIFImageClient
9+
from piffle.load_iiif import load_iiif_presentation
10+
from tind_client import TINDClient, TINDError
611

712
from mokelumne.util.storage import run_dir, record_dir
813

914

15+
logger = logging.getLogger(__name__)
16+
"""The TIND Fetcher logger."""
17+
18+
1019
class FetchTind:
1120
"""Helper methods for fetching items from TIND using TINDClient."""
1221
def __init__(self, _run_id: str):
@@ -17,13 +26,53 @@ def get_ids(self, tind_query: str) -> list[str]:
1726
"""Return the TIND IDs that match a given query."""
1827
return self.client.fetch_ids_search(tind_query)
1928

29+
def get_first_file_metadata(self, tind_id: str) -> dict[str, Any]:
    """Return the metadata of the first file attached to a TIND record.

    :param tind_id: The TIND record ID.
    :returns: The first entry of the record's file metadata list, or an
              empty dict when the list is empty/missing or its first
              entry is itself empty.
    """
    attachments = self.client.fetch_file_metadata(tind_id)
    first = attachments[0] if attachments else None
    # A falsy first entry (None, {}) is normalised to an empty dict.
    return first or {}
35+
2036
def download_image_file(self, tind_id: str) -> str:
2137
"""Download the first file attachment for a given TIND ID."""
22-
record = self.client.fetch_file_metadata(tind_id)
23-
download_url = record[0]["url"]
38+
metadata = self.get_first_file_metadata(tind_id)
39+
download_url = metadata["url"]
2440
record_path = record_dir(self.run_id, tind_id)
2541
return self.client.fetch_file(download_url, str(record_path))
2642

43+
def download_image_from_record_sized(self, tind_id: str, width: int, height: int,
                                     timeout: float = 60.0) -> str:
    """Download the first image for a given TIND ID with the given size.

    The IIIF manifest URL is built from the ``TIND_IIIF_MANIFEST_URL_PATTERN``
    environment variable (falling back to the digicoll.lib.berkeley.edu
    pattern), the first image of the first canvas is located in that
    manifest, and the image is requested at exactly ``width`` x ``height``.
    The response is streamed to a file in the record's storage directory.

    :param tind_id: The TIND record ID.
    :param int width: The desired width of the image in pixels.
    :param int height: The desired height of the image in pixels.
    :param float timeout: Seconds to wait on the image server (connect and
                          each read) before giving up.
    :returns: The path where the file was saved.
    :raises requests.HTTPError: If the image server returns an error status.
    """
    url = ENV.get(
        "TIND_IIIF_MANIFEST_URL_PATTERN",
        "https://digicoll.lib.berkeley.edu/record/{tind_id}/export/iiif_manifest"
    ).format(tind_id=tind_id)

    manifest = load_iiif_presentation(url)
    canvases = len(manifest.items)
    if canvases != 1:
        logger.warning("%s: manifest has invalid number of canvases: %d; crash may follow",
                       tind_id, canvases)

    # Manifest -> Canvas -> AnnotationPage -> Annotation -> Image
    image_id = manifest.items[0].items[0].items[0].body["id"]
    image = IIIFImageClient(*image_id.rsplit("/", 1))
    image_url = str(image.size(width=width, height=height, exact=True))

    # Stream the body so large images are never held fully in memory, bound
    # the wait so a stalled server cannot hang the task forever, and let the
    # context manager release the connection even when an error is raised.
    with requests.get(image_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        output_path = record_dir(self.run_id, tind_id) / image.image_id
        with output_path.open('wb') as out_f:
            # iter_content() defaults to 1-byte chunks; ask for 64 KiB instead.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out_f.write(chunk)

    return str(output_path)
75+
2776
def write_query_results_to_xml(self, tind_query: str, file_name: str = "") -> int:
2877
"""Download the XML results of a search query from TIND."""
2978
records_written = self.client.write_search_results_to_file(tind_query, file_name)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies = [
99
"apache-airflow-task-sdk == 1.2.1",
1010
"langchain[aws]",
1111
"langfuse",
12+
"piffle", # IIIF API used in batch image description DAG
1213
"pymarc",
1314
"python-tind-client",
1415
]

requirements.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ a2wsgi==1.10.10 \
44
--hash=sha256:a5bcffb52081ba39df0d5e9a884fc6f819d92e3a42389343ba77cbf809fe1f45 \
55
--hash=sha256:d2b21379479718539dc15fce53b876251a0efe7615352dfe49f6ad1bc507848d
66
# via apache-airflow-core
7+
addict==2.4.0 \
8+
--hash=sha256:249bb56bbfd3cdc2a004ea0ff4c2b6ddc84d53bc2194761636eb314d5cfa5dfc \
9+
--hash=sha256:b3b2210e0e067a281f5646c8c5db92e99b7231ea8b0eb5f74dbdf9e259d4e494
10+
# via piffle
711
aiosmtplib==5.1.0 \
812
--hash=sha256:2504a23b2b63c9de6bc4ea719559a38996dba68f73f6af4eb97be20ee4c5e6c4 \
913
--hash=sha256:368029440645b486b69db7029208a7a78c6691b90d24a5332ddba35d9109d55b
@@ -119,6 +123,10 @@ botocore==1.42.84 \
119123
# via
120124
# boto3
121125
# s3transfer
126+
cached-property==2.0.1 \
127+
--hash=sha256:484d617105e3ee0e4f1f58725e72a8ef9e93deee462222dbd51cd91230897641 \
128+
--hash=sha256:f617d70ab1100b7bcf6e42228f9ddcb78c676ffa167278d9f730d1c2fba69ccb
129+
# via piffle
122130
cadwyn==6.2.0 \
123131
--hash=sha256:896901fd3b8425a550e08afb3dff4081c642ce4a9667bc66670904b1908b21b6 \
124132
--hash=sha256:ef9aff5279494b82d90e67bc68cb32b1413394346f9dea85db47ad40a2462b1a
@@ -1504,6 +1512,10 @@ pendulum==3.2.0 \
15041512
# via
15051513
# apache-airflow-core
15061514
# apache-airflow-task-sdk
1515+
piffle==0.7.0 \
1516+
--hash=sha256:00f5b9e74668e6e068ee72999158d0cf49f9c75e9b02426a1f842add9253b5c3 \
1517+
--hash=sha256:64a8e087c5fedccae118a15efbb33938370fd9ac1bec01ff64dfc8f5e732718f
1518+
# via mokelumne (pyproject.toml)
15071519
playwright==1.58.0 \
15081520
--hash=sha256:185e0132578733d02802dfddfbbc35f42be23a45ff49ccae5081f25952238117 \
15091521
--hash=sha256:1e03be090e75a0fabbdaeab65ce17c308c425d879fa48bb1d7986f96bfad0b99 \
@@ -1885,6 +1897,7 @@ requests==2.33.1 \
18851897
# apache-airflow-core
18861898
# langsmith
18871899
# opentelemetry-exporter-otlp-proto-http
1900+
# piffle
18881901
# pytest-base-url
18891902
# python-tind-client
18901903
# requests-toolbelt

0 commit comments

Comments
 (0)