Move read ordering back to SQL query

ghukill · ghukill · commit 51a35ac6e814 · 2025-08-25T16:25:20.000-04:00
Why these changes are being introduced: The ordering of metadata records by filename + run_record_offset had been moved into the Python pandas context for a performance boost, but that was not ideal from the point of view of keeping the majority of our logic in SQL. Having learned that ordering by `hash(filename)` still orders the filenames, but with a dramatic speed and memory improvement, it makes sense to move the ordering back into the SQL context. How this addresses that need: * Moves metadata query ordering back into SQL instead of the Python pandas context Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-543
diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py
@@ -387,7 +387,6 @@ def read_batches_iter(
         metadata_time = time.perf_counter()
         meta_query = self.metadata.build_meta_query(table, limit, where, **filters)
         meta_df = self.metadata.conn.query(meta_query).to_df()
-        meta_df = meta_df.sort_values(by=["filename", "run_record_offset"])
         logger.debug(
             f"Metadata query identified {len(meta_df)} rows, "
             f"across {len(meta_df.filename.unique())} parquet files, "
diff --git a/timdex_dataset_api/metadata.py b/timdex_dataset_api/metadata.py
@@ -12,7 +12,7 @@
 import duckdb
 from duckdb import DuckDBPyConnection
 from duckdb_engine import Dialect as DuckDBDialect
-from sqlalchemy import Table, and_, select, text
+from sqlalchemy import Table, and_, func, select, text
 
 from timdex_dataset_api.config import configure_logger
 from timdex_dataset_api.utils import (
@@ -642,6 +642,14 @@ def build_meta_query(
         if combined is not None:
             stmt = stmt.where(combined)
 
+        # order by filename + run_record_offset
+        # NOTE: we order by a hash of the filename for a dramatic speedup; the exact
+        #   file order doesn't matter, just that rows from the same file sort together
+        stmt = stmt.order_by(
+            func.hash(sa_table.c.filename),
+            sa_table.c.run_record_offset,
+        )
+
         # apply limit if present
         if limit:
             stmt = stmt.limit(limit)