Commit efb3973

Use keyset pagination for meta and data read methods
Why these changes are being introduced:

For all read methods, the former approach was to perform a metadata query, store the entire result set in memory, then loop through chunks of that metadata and build SQL queries to perform data retrieval. This worked even for metadata queries that bring back 3-4 million results, but there is an upper limit. Ideally, we would perform all of our queries -- metadata and data -- in chunks to ease memory pressure, and in some cases this can also increase performance.

How this addresses that need:

This reworks the base read_batches_iter() method to perform smaller, chunked metadata queries. To paginate the results, instead of using the slow LIMIT / OFFSET approach, we use keyset pagination: each page looks for values greater than a tuple of ordered values taken from the previous chunk. This is often the preferred way to paginate when you have well-ordered columns. In support of this, we also begin hashing the filename and run_id columns for ordering, providing almost an order-of-magnitude speedup; the cost of computing the hash is more than offset by ordering integers rather than very long strings. The net effect is no change to the input/output signatures of the read methods, but improved memory usage and performance.

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/TIMX-543
1 parent 7337f31 commit efb3973
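
To make the keyset approach described in the commit message concrete, here is a minimal, hypothetical sketch of paginating a DuckDB metadata table with a row-value (tuple) comparison instead of LIMIT / OFFSET. The `records` table name, the `iter_meta_pages` helper, and the chunk size are illustrative assumptions, not the library's actual query builder.

# Illustrative sketch only -- not the TIMDEXDatasetMetadata implementation.
# Assumes a DuckDB connection with a "records" metadata table containing
# timdex_record_id, run_id, run_record_offset, and filename columns.
import duckdb


def iter_meta_pages(conn: duckdb.DuckDBPyConnection, chunk_size: int = 100_000):
    """Yield pyarrow Tables of metadata rows using keyset pagination."""
    # start "before" every row; DuckDB's hash() returns non-negative integers
    keyset = (0, 0, 0)  # (filename_hash, run_id_hash, run_record_offset)
    while True:
        page = conn.execute(
            """
            select
                timdex_record_id,
                run_id,
                hash(run_id) as run_id_hash,
                run_record_offset,
                filename,
                hash(filename) as filename_hash
            from records
            where (hash(filename), hash(run_id), run_record_offset) > (?, ?, ?)
            order by hash(filename), hash(run_id), run_record_offset
            limit ?
            """,
            [*keyset, chunk_size],
        ).fetch_arrow_table()

        if page.num_rows == 0:  # an empty page signals the end of pagination
            break
        yield page

        # the next page starts strictly after the last row of this page
        keyset = (
            page["filename_hash"][-1].as_py(),
            page["run_id_hash"][-1].as_py(),
            page["run_record_offset"][-1].as_py(),
        )

Ordering and comparing on hash(filename) and hash(run_id) keeps the keyset columns as integers, which is where the commit's near order-of-magnitude speedup over ordering very long strings comes from.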

File tree

4 files changed: +139 -68 lines changed

tests/conftest.py
timdex_dataset_api/__init__.py
timdex_dataset_api/dataset.py
timdex_dataset_api/metadata.py

tests/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -114,7 +114,7 @@ def timdex_dataset_multi_source(tmp_path_factory) -> TIMDEXDataset:

     # ensure static metadata database exists for read methods
     dataset.metadata.rebuild_dataset_metadata()
-    dataset.metadata.refresh()
+    dataset.refresh()

     return dataset

@@ -234,7 +234,7 @@ def timdex_dataset_with_runs_with_metadata(
 ) -> TIMDEXDataset:
     """TIMDEXDataset with runs and static metadata created for read tests."""
     timdex_dataset_with_runs.metadata.rebuild_dataset_metadata()
-    timdex_dataset_with_runs.metadata.refresh()
+    timdex_dataset_with_runs.refresh()
     return timdex_dataset_with_runs

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
 from timdex_dataset_api.record import DatasetRecord

-__version__ = "3.1.0"
+__version__ = "3.2.0"

 __all__ = [
     "DatasetRecord",

timdex_dataset_api/dataset.py

Lines changed: 102 additions & 42 deletions
@@ -15,6 +15,7 @@
 import boto3
 import pandas as pd
 import pyarrow as pa
+import pyarrow.compute as pc
 import pyarrow.dataset as ds
 from duckdb import DuckDBPyConnection
 from pyarrow import fs
@@ -364,7 +365,7 @@ def read_batches_iter(
     ) -> Iterator[pa.RecordBatch]:
         """Yield ETL records as pyarrow.RecordBatches.

-        This method performs a two step process:
+        This method performs a two-step process:

         1. Perform a "metadata" query that narrows down records and physical parquet
         files to read from.
@@ -383,33 +384,36 @@ def read_batches_iter(
         """
         start_time = time.perf_counter()

-        # build and execute metadata query
-        metadata_time = time.perf_counter()
-        meta_query = self.metadata.build_meta_query(table, limit, where, **filters)
-        meta_df = self.metadata.conn.query(meta_query).to_df()
-        logger.debug(
-            f"Metadata query identified {len(meta_df)} rows, "
-            f"across {len(meta_df.filename.unique())} parquet files, "
-            f"elapsed: {round(time.perf_counter()-metadata_time,2)}s"
-        )
-
         # execute data queries in batches and yield results
         total_yield_count = 0
-        for i, meta_chunk_df in enumerate(self._iter_meta_chunks(meta_df)):
+        for i, meta_chunk in enumerate(
+            self._iter_meta_chunks(
+                table,
+                limit=limit,
+                where=where,
+                **filters,
+            )
+        ):
             batch_time = time.perf_counter()
-            batch_yield_count = len(meta_chunk_df)
+
+            batch_yield_count = meta_chunk.num_rows
             total_yield_count += batch_yield_count

-            if batch_yield_count == 0:
-                continue
+            # register meta chunk as DuckDB asset
+            self.conn.register("meta_chunk", meta_chunk)

-            self.conn.register("meta_chunk", meta_chunk_df)
+            # perform data query and yield results
             data_query = self._build_data_query_for_chunk(
                 columns,
-                meta_chunk_df,
+                meta_chunk,
                 registered_metadata_chunk="meta_chunk",
             )
-            yield from self._stream_data_query_batches(data_query)
+            cursor = self.conn.execute(data_query)
+            yield from cursor.fetch_record_batch(
+                rows_per_batch=self.config.read_batch_size
+            )
+
+            # deregister meta chunk
             self.conn.unregister("meta_chunk")

             batch_rps = int(batch_yield_count / (time.perf_counter() - batch_time))
@@ -422,32 +426,94 @@ def read_batches_iter(
             f"read_batches_iter() elapsed: {round(time.perf_counter()-start_time, 2)}s"
         )

-    def _iter_meta_chunks(self, meta_df: pd.DataFrame) -> Iterator[pd.DataFrame]:
-        """Utility method to yield chunks of metadata query results."""
-        for start in range(0, len(meta_df), self.config.duckdb_join_batch_size):
-            yield meta_df.iloc[start : start + self.config.duckdb_join_batch_size]
+    def _iter_meta_chunks(
+        self,
+        table: str = "records",
+        limit: int | None = None,
+        where: str | None = None,
+        **filters: Unpack[DatasetFilters],
+    ) -> Iterator[pa.lib.Table]:
+        """Utility method to yield pyarrow Table chunks of metadata query results.

-    def _build_parquet_file_list(self, meta_chunk_df: pd.DataFrame) -> str:
-        """Build SQL list of parquet filepaths."""
-        filenames = meta_chunk_df["filename"].unique().tolist()
-        if self.location_scheme == "s3":
-            filenames = [f"s3://{f.removeprefix('s3://')}" for f in filenames]
-        return "[" + ",".join((f"'{f}'") for f in filenames) + "]"
+        The approach here is to use "keyset" pagination, which means each paged result
+        is a greater-than (>) check against a tuple of ordered values from the previous
+        chunk. This is more performant than a LIMIT + OFFSET.
+        """
+        # use duckdb_join_batch_size as the chunk size for keyset pagination
+        chunk_size = self.config.duckdb_join_batch_size
+
+        # init keyset value of zeros to begin with
+        keyset_value = (0, 0, 0)
+
+        total_yielded = 0
+        while True:
+
+            # enforce limit if passed
+            if limit is not None:
+                remaining = limit - total_yielded
+                if remaining <= 0:
+                    break
+                chunk_limit = min(chunk_size, remaining)
+            else:
+                chunk_limit = chunk_size
+
+            # perform chunk query and convert to pyarrow Table
+            meta_query = self.metadata.build_keyset_paginated_metadata_query(
+                table,
+                limit=chunk_limit,  # pass chunk_limit instead of limit
+                where=where,
+                keyset_value=keyset_value,
+                **filters,
+            )
+            meta_chunk = self.metadata.conn.query(meta_query).to_arrow_table()
+
+            # an empty chunk signals end of pagination
+            if meta_chunk.num_rows == 0:
+                break
+
+            # yield this chunk of data
+            total_yielded += meta_chunk.num_rows
+            yield meta_chunk
+
+            # update keyset value using the last row from this chunk
+            keyset_value = (
+                meta_chunk["filename_hash"][-1].as_py(),
+                meta_chunk["run_id_hash"][-1].as_py(),
+                meta_chunk["run_record_offset"][-1].as_py(),
+            )

     def _build_data_query_for_chunk(
         self,
         columns: list[str] | None,
-        meta_chunk_df: pd.DataFrame,
+        meta_chunk: pa.lib.Table,
         registered_metadata_chunk: str = "meta_chunk",
     ) -> str:
-        """Build SQL query used for data retrieval, joining on metadata data."""
-        parquet_list_sql = self._build_parquet_file_list(meta_chunk_df)
-        rro_list_sql = ",".join(
-            str(rro) for rro in meta_chunk_df["run_record_offset"].unique()
-        )
+        """Build SQL query used for data retrieval, joining on passed metadata data."""
+        # build list of explicit parquet files to read from
+        filenames = pc.unique(meta_chunk["filename"]).to_pylist()
+        if self.location_scheme == "s3":
+            filenames = [
+                f"s3://{f.removeprefix('s3://')}" for f in filenames  # type: ignore[union-attr]
+            ]
+        parquet_list_sql = "[" + ",".join((f"'{f}'") for f in filenames) + "]"
+
+        # build select columns
         select_cols = ",".join(
             [f"ds.{col}" for col in (columns or TIMDEX_DATASET_SCHEMA.names)]
         )
+
+        # build run_record_offset WHERE clause to leverage row group pruning
+        rro_values = pc.unique(meta_chunk["run_record_offset"]).to_pylist()
+        rro_values.sort()
+        if len(rro_values) <= 1000:  # noqa: PLR2004
+            rro_clause = (
+                f"and run_record_offset in ({','.join(str(rro) for rro in rro_values)})"
+            )
+        else:
+            rro_clause = (
+                f"and run_record_offset between {rro_values[0]} and {rro_values[-1]}"
            )
+
         return f"""
         select
             {select_cols}
@@ -459,16 +525,10 @@ def _build_data_query_for_chunk(
         inner join {registered_metadata_chunk} mc using (
             timdex_record_id, run_id, run_record_offset
         )
-        where ds.run_record_offset in ({rro_list_sql});
+        where true
+        {rro_clause};
         """

-    def _stream_data_query_batches(self, data_query: str) -> Iterator[pa.RecordBatch]:
-        """Yield pyarrow RecordBatches from a SQL query."""
-        self.conn.execute("set enable_progress_bar = false;")
-        cursor = self.conn.execute(data_query)
-        yield from cursor.fetch_record_batch(rows_per_batch=self.config.read_batch_size)
-        self.conn.execute("set enable_progress_bar = true;")
-
     def read_dataframes_iter(
         self,
         table: str = "records",
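
Because the commit keeps the read methods' input/output signatures unchanged, existing calling code should keep working, only with chunked metadata queries underneath. A hedged usage sketch follows; the dataset location passed to the constructor is an invented example and is not taken from this diff.

# Hypothetical usage sketch -- the constructor argument is an assumption, not from this diff.
from timdex_dataset_api import TIMDEXDataset

td = TIMDEXDataset("s3://example-bucket/timdex-dataset")  # assumed dataset location
td.refresh()  # as in the conftest.py change above, refresh() is now called on the dataset itself

total_rows = 0
for batch in td.read_batches_iter(table="records", limit=100_000):
    # each yielded item is a pyarrow.RecordBatch, exactly as before this commit
    total_rows += batch.num_rows
print(f"read {total_rows} rows")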

timdex_dataset_api/metadata.py

Lines changed: 34 additions & 23 deletions
@@ -12,7 +12,7 @@
 import duckdb
 from duckdb import DuckDBPyConnection
 from duckdb_engine import Dialect as DuckDBDialect
-from sqlalchemy import Table, and_, func, select, text
+from sqlalchemy import Table, func, literal, select, text, tuple_

 from timdex_dataset_api.config import configure_logger
 from timdex_dataset_api.utils import (
@@ -619,42 +619,56 @@ def write_append_delta_duckdb(self, filepath: str) -> None:
             f"Append delta written: {output_path}, {time.perf_counter()-start_time}s"
         )

-    def build_meta_query(
+    def build_keyset_paginated_metadata_query(
         self,
         table: str,
-        limit: int | None,
-        where: str | None,
+        *,
+        limit: int | None = None,
+        where: str | None = None,
+        keyset_value: tuple[int, int, int] = (0, 0, 0),
         **filters: Unpack["DatasetFilters"],
     ) -> str:
         """Build SQL query using SQLAlchemy against metadata schema tables and views."""
         sa_table = self.get_sa_table(table)

-        # build WHERE clause filter expression based on any passed key/value filters
-        # and/or an explicit WHERE string
-        filter_expr = build_filter_expr_sa(sa_table, **filters)
-        if where is not None and where.strip():
-            text_where = text(where)
-            combined = (
-                and_(filter_expr, text_where) if filter_expr is not None else text_where
-            )
-        else:
-            combined = filter_expr
-
         # create SQL statement object
         stmt = select(
             sa_table.c.timdex_record_id,
             sa_table.c.run_id,
+            func.hash(sa_table.c.run_id).label("run_id_hash"),
             sa_table.c.run_record_offset,
             sa_table.c.filename,
+            func.hash(sa_table.c.filename).label("filename_hash"),
         ).select_from(sa_table)
-        if combined is not None:
-            stmt = stmt.where(combined)
+
+        # filter expressions from key/value filters (may return None)
+        filter_expr = build_filter_expr_sa(sa_table, **filters)
+        if filter_expr is not None:
+            stmt = stmt.where(filter_expr)
+
+        # explicit raw WHERE string
+        if where is not None and where.strip():
+            stmt = stmt.where(text(where))
+
+        # keyset pagination
+        filename_hash, run_id_hash, run_record_offset_ = keyset_value
+        stmt = stmt.where(
+            tuple_(
+                func.hash(sa_table.c.filename),
+                func.hash(sa_table.c.run_id),
+                sa_table.c.run_record_offset,
+            )
+            > tuple_(
+                literal(filename_hash),
+                literal(run_id_hash),
+                literal(run_record_offset_),
+            )
+        )

         # order by filename + run_record_offset
-        # NOTE: we use a hash of the filename for ordering for a dramatic speedup, where
-        # we don't really care about the exact order, just that they are ordered
         stmt = stmt.order_by(
             func.hash(sa_table.c.filename),
+            func.hash(sa_table.c.run_id),
             sa_table.c.run_record_offset,
         )

@@ -667,7 +681,4 @@ def build_meta_query(
             dialect=DuckDBDialect(),
             compile_kwargs={"literal_binds": True},
         )
-        compiled_str = str(compiled)
-        logger.debug(compiled_str)
-
-        return compiled_str
+        return str(compiled)
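
As a side note on the tuple_() keyset predicate in the diff above, a small self-contained sketch (with made-up table and column definitions, not the library's metadata schema objects) shows how SQLAlchemy renders that predicate as a single row-value comparison:

# Standalone illustration of the keyset predicate; the table definition is made up.
from sqlalchemy import Column, Integer, MetaData, String, Table, func, literal, select, tuple_

md = MetaData()
records = Table(
    "records",
    md,
    Column("filename", String),
    Column("run_id", String),
    Column("run_record_offset", Integer),
)

keyset_value = (0, 0, 0)  # (filename_hash, run_id_hash, run_record_offset)
stmt = (
    select(records.c.run_record_offset)
    .where(
        tuple_(
            func.hash(records.c.filename),
            func.hash(records.c.run_id),
            records.c.run_record_offset,
        )
        > tuple_(*(literal(v) for v in keyset_value))
    )
    .order_by(
        func.hash(records.c.filename),
        func.hash(records.c.run_id),
        records.c.run_record_offset,
    )
)

print(stmt.compile(compile_kwargs={"literal_binds": True}))
# Expect roughly:
#   SELECT records.run_record_offset FROM records
#   WHERE (hash(records.filename), hash(records.run_id), records.run_record_offset) > (0, 0, 0)
#   ORDER BY hash(records.filename), hash(records.run_id), records.run_record_offset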
