Rename timestamp to embedding_timestamp

ghukill · ghukill · commit b12577226765 · 2025-12-16T09:15:13.000-05:00
Why these changes are being introduced: A PR discussion pointed out that `timestamp` was the only column in the new embeddings schema that, among the columns unique to this schema, was not prefixed with `embedding_`. There was some discussion that for any of those columns it's implied they are part of the embeddings schema, so the `embedding_` prefix is not necessary. This would hold true in a pure SQL world with tables and views acting like namespaces. But at the level of this Python library, where we support quite a bit of kwargs filtering of read methods, that kind of namespacing is awkward. We would like to be able to filter by `run_timestamp` or `embedding_timestamp`, or in the future maybe `fulltext_timestamp`. Being verbose with our column names simplifies this a bit. How this addresses that need: Renames the column `timestamp` to `embedding_timestamp` in the embeddings schema. Side effects of this change: * The CLI timdex-embeddings will need an update to the column names it writes. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-143 * https://mitlibraries.atlassian.net/browse/USE-288
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
@@ -31,7 +31,7 @@ def test_dataset_embedding_init():
         "run_record_offset": 0,
         "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
         "embedding_strategy": "full_record",
-        "timestamp": "2024-12-01T10:00:00+00:00",
+        "embedding_timestamp": "2024-12-01T10:00:00+00:00",
         "embedding_vector": [0.1, 0.2, 0.3],
         "embedding_object": json.dumps(
             {"token1": 0.1, "token2": 0.2, "token3": 0.3}
@@ -41,7 +41,7 @@ def test_dataset_embedding_init():
 
     assert embedding
     assert embedding.timdex_record_id == "alma:123"
-    assert embedding.timestamp == datetime(2024, 12, 1, 10, 0, tzinfo=UTC)
+    assert embedding.embedding_timestamp == datetime(2024, 12, 1, 10, 0, tzinfo=UTC)
     assert embedding.embedding_object == b'{"token1": 0.1, "token2": 0.2, "token3": 0.3}'
 
 
@@ -52,7 +52,7 @@ def test_dataset_embedding_date_properties():
         run_record_offset=0,
         embedding_model="sentence-transformers/all-MiniLM-L6-v2",
         embedding_strategy="full_record",
-        timestamp="2024-12-01T10:00:00+00:00",
+        embedding_timestamp="2024-12-01T10:00:00+00:00",
         embedding_vector=[0.1, 0.2, 0.3],
     )
 
@@ -66,7 +66,7 @@ def test_dataset_embedding_to_dict():
         "run_record_offset": 0,
         "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
         "embedding_strategy": "full_record",
-        "timestamp": "2024-12-01T10:00:00+00:00",
+        "embedding_timestamp": "2024-12-01T10:00:00+00:00",
         "embedding_vector": [0.1, 0.2, 0.3],
         "embedding_object": None,
     }
@@ -352,14 +352,14 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
     # first embeddings run for run "lemon-2"
     td.embeddings.write(
         generate_sample_embeddings_for_run(
-            td, run_id="lemon-2", timestamp="2025-08-02T00:00:00+00:00"
+            td, run_id="lemon-2", embedding_timestamp="2025-08-02T00:00:00+00:00"
         )
     )
 
     # second embeddings run for run "lemon-2" with a later timestamp
     td.embeddings.write(
         generate_sample_embeddings_for_run(
-            td, run_id="lemon-2", timestamp="2025-08-03T00:00:00+00:00"
+            td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
         )
     )
 
@@ -372,7 +372,7 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
     assert len(all_lemon_2) == 10
 
     # verify both timestamps exist
-    unique_timestamps = all_lemon_2["timestamp"].unique()
+    unique_timestamps = all_lemon_2["embedding_timestamp"].unique()
     assert len(unique_timestamps) == 2
 
     # query current_embeddings for lemon source
@@ -392,8 +392,8 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
     assert (lemon_2_result["run_date"] == date(2025, 8, 2)).all()
 
     # all lemon-2 current embeddings should have the later embedding timestamp
-    max_timestamp = all_lemon_2["timestamp"].max()
-    assert (lemon_2_result["timestamp"] == max_timestamp).all()
+    max_timestamp = all_lemon_2["embedding_timestamp"].max()
+    assert (lemon_2_result["embedding_timestamp"] == max_timestamp).all()
 
 
 def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_views):
@@ -405,14 +405,14 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v
     # write embeddings for lemon-2 (first time) with explicit timestamp
     td.embeddings.write(
         generate_sample_embeddings_for_run(
-            td, run_id="lemon-2", timestamp="2025-08-02T00:00:00+00:00"
+            td, run_id="lemon-2", embedding_timestamp="2025-08-02T00:00:00+00:00"
         )
     )
 
     # write embeddings for lemon-2 again with later timestamp
     td.embeddings.write(
         generate_sample_embeddings_for_run(
-            td, run_id="lemon-2", timestamp="2025-08-03T00:00:00+00:00"
+            td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
         )
     )
 
diff --git a/tests/utils.py b/tests/utils.py
@@ -68,7 +68,7 @@ def generate_sample_embeddings(
     embedding_model: str | None = "super-org/amazing-model",
     embedding_strategy: str | None = "full_record",
     run_id: str | None = None,
-    timestamp: str | None = "2024-12-01T00:00:00+00:00",
+    embedding_timestamp: str | None = "2024-12-01T00:00:00+00:00",
 ) -> Iterator[DatasetEmbedding]:
     """Generate sample DatasetEmbeddings."""
     if not run_id:
@@ -90,7 +90,7 @@ def generate_sample_embeddings(
             run_record_offset=x,
             embedding_model=embedding_model,
             embedding_strategy=embedding_strategy,
-            timestamp=timestamp,
+            embedding_timestamp=embedding_timestamp,
             embedding_vector=embedding_vector,
             embedding_object=embedding_object,
         )
@@ -101,7 +101,7 @@ def generate_sample_embeddings_for_run(
     run_id: str,
     embedding_model: str | None = "super-org/amazing-model",
     embedding_strategy: str | None = "full_record",
-    timestamp: str | None = None,
+    embedding_timestamp: str | None = None,
     embedding_dimensions: int = 3,
 ) -> Iterator[DatasetEmbedding]:
     """Generate sample DatasetEmbeddings for a given ETL run."""
@@ -114,8 +114,8 @@ def generate_sample_embeddings_for_run(
     """
     ).to_df()
 
-    if not timestamp:
-        timestamp = records_metadata.iloc[0].run_timestamp.isoformat()
+    if not embedding_timestamp:
+        embedding_timestamp = records_metadata.iloc[0].run_timestamp.isoformat()
 
     for _idx, record in records_metadata.iterrows():
         embedding_vector = [random.random() for _ in range(embedding_dimensions)]
@@ -131,5 +131,5 @@ def generate_sample_embeddings_for_run(
             embedding_strategy=embedding_strategy,
             embedding_vector=embedding_vector,
             embedding_object=embedding_object,
-            timestamp=timestamp,
+            embedding_timestamp=embedding_timestamp,
         )
diff --git a/timdex_dataset_api/embeddings.py b/timdex_dataset_api/embeddings.py
@@ -31,7 +31,7 @@
         pa.field("timdex_record_id", pa.string()),
         pa.field("run_id", pa.string()),
         pa.field("run_record_offset", pa.int32()),
-        pa.field("timestamp", pa.timestamp("us", tz="UTC")),
+        pa.field("embedding_timestamp", pa.timestamp("us", tz="UTC")),
         pa.field("embedding_model", pa.string()),
         pa.field("embedding_strategy", pa.string()),
         pa.field("embedding_vector", pa.list_(pa.float32())),
@@ -47,7 +47,7 @@
     "timdex_record_id",
     "run_id",
     "run_record_offset",
-    "timestamp",
+    "embedding_timestamp",
     "embedding_model",
     "embedding_strategy",
 }
@@ -67,7 +67,7 @@ class EmbeddingsFilters(TypedDict, total=False):
     timdex_record_id: str
     run_id: str
     run_record_offset: int
-    timestamp: str | datetime
+    embedding_timestamp: str | datetime
     embedding_model: str
     embedding_strategy: str
     # record metadata columns
@@ -92,7 +92,7 @@ class DatasetEmbedding:
         embedding_strategy: Strategy used to create embedding
             - this correlates to a transformation strategy in the timdex-embeddings CLI
             application, e.g. "full_record"
-        timestamp: Timestamp when embedding was created
+        embedding_timestamp: Timestamp when embedding was created
         embedding_vector: Numerical vector representation of embedding
             - preferred form for storing embedding as a numerical array
         embedding_object: Object representation of the embedding
@@ -105,7 +105,7 @@ class DatasetEmbedding:
     run_record_offset: int = field()
     embedding_model: str = field()
     embedding_strategy: str = field()
-    timestamp: datetime = field(  # type: ignore[assignment]
+    embedding_timestamp: datetime = field(  # type: ignore[assignment]
         converter=datetime_iso_parse,
         default=attrs.Factory(lambda: datetime.now(tz=UTC).isoformat()),
     )
@@ -114,15 +114,15 @@ class DatasetEmbedding:
 
     @property
     def year(self) -> str:
-        return self.timestamp.strftime("%Y")
+        return self.embedding_timestamp.strftime("%Y")
 
     @property
     def month(self) -> str:
-        return self.timestamp.strftime("%m")
+        return self.embedding_timestamp.strftime("%m")
 
     @property
     def day(self) -> str:
-        return self.timestamp.strftime("%d")
+        return self.embedding_timestamp.strftime("%d")
 
     def to_dict(
         self,
@@ -232,15 +232,15 @@ def _create_current_embeddings_view(self, conn: DuckDBPyConnection) -> None:
             create or replace view data.current_embeddings as
             (
                 with
-                    -- CTE of embeddings ranked by timestamp
+                    -- CTE of embeddings ranked by embedding_timestamp
                     ce_ranked_embeddings as
                     (
                         select
                             *,
                             row_number() over (
                                 partition by timdex_record_id, embedding_strategy
                                 order by
-                                    timestamp desc nulls last,
+                                    embedding_timestamp desc nulls last,
                                     run_record_offset desc nulls last
                             ) as rn
                         from data.embeddings
@@ -269,15 +269,15 @@ def _create_current_run_embeddings_view(self, conn: DuckDBPyConnection) -> None:
             create or replace view data.current_run_embeddings as
             (
                 with
-                    -- CTE of embeddings ranked by timestamp
+                    -- CTE of embeddings ranked by embedding_timestamp
                     ce_ranked_embeddings as
                     (
                         select
                             *,
                             row_number() over (
                                 partition by timdex_record_id, run_id, embedding_strategy
                                 order by
-                                    timestamp desc nulls last,
+                                    embedding_timestamp desc nulls last,
                                     run_id desc nulls last,
                                     run_record_offset desc nulls last
                             ) as rn