Skip to content

Commit b125772

Browse files
committed
Rename timestamp to embedding_timestamp
Why these changes are being introduced: A PR discussion pointed out that `timestamp` was the only column unique to the new embeddings schema that was not prefixed with `embedding_`. There was some discussion that, for any of those columns, it is implied they are part of the embeddings schema, so the `embedding_` prefix is not necessary. This would hold true in a pure SQL world with tables and views acting like namespaces. But at the level of this Python library, where we support quite a bit of kwargs-based filtering in read methods, that kind of namespacing is awkward. We would like to be able to filter by `run_timestamp` or `embedding_timestamp`, or in the future maybe `fulltext_timestamp`. Being verbose with our column names simplifies this a bit. How this addresses that need: Renames the column `timestamp` to `embedding_timestamp` in the embeddings schema. Side effects of this change: * The timdex-embeddings CLI will need an update to the column names it writes Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-143 * https://mitlibraries.atlassian.net/browse/USE-288
1 parent e498931 commit b125772

File tree

3 files changed

+29
-29
lines changed

3 files changed

+29
-29
lines changed

tests/test_embeddings.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def test_dataset_embedding_init():
3131
"run_record_offset": 0,
3232
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
3333
"embedding_strategy": "full_record",
34-
"timestamp": "2024-12-01T10:00:00+00:00",
34+
"embedding_timestamp": "2024-12-01T10:00:00+00:00",
3535
"embedding_vector": [0.1, 0.2, 0.3],
3636
"embedding_object": json.dumps(
3737
{"token1": 0.1, "token2": 0.2, "token3": 0.3}
@@ -41,7 +41,7 @@ def test_dataset_embedding_init():
4141

4242
assert embedding
4343
assert embedding.timdex_record_id == "alma:123"
44-
assert embedding.timestamp == datetime(2024, 12, 1, 10, 0, tzinfo=UTC)
44+
assert embedding.embedding_timestamp == datetime(2024, 12, 1, 10, 0, tzinfo=UTC)
4545
assert embedding.embedding_object == b'{"token1": 0.1, "token2": 0.2, "token3": 0.3}'
4646

4747

@@ -52,7 +52,7 @@ def test_dataset_embedding_date_properties():
5252
run_record_offset=0,
5353
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
5454
embedding_strategy="full_record",
55-
timestamp="2024-12-01T10:00:00+00:00",
55+
embedding_timestamp="2024-12-01T10:00:00+00:00",
5656
embedding_vector=[0.1, 0.2, 0.3],
5757
)
5858

@@ -66,7 +66,7 @@ def test_dataset_embedding_to_dict():
6666
"run_record_offset": 0,
6767
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
6868
"embedding_strategy": "full_record",
69-
"timestamp": "2024-12-01T10:00:00+00:00",
69+
"embedding_timestamp": "2024-12-01T10:00:00+00:00",
7070
"embedding_vector": [0.1, 0.2, 0.3],
7171
"embedding_object": None,
7272
}
@@ -352,14 +352,14 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
352352
# first embeddings run for run "lemon-2"
353353
td.embeddings.write(
354354
generate_sample_embeddings_for_run(
355-
td, run_id="lemon-2", timestamp="2025-08-02T00:00:00+00:00"
355+
td, run_id="lemon-2", embedding_timestamp="2025-08-02T00:00:00+00:00"
356356
)
357357
)
358358

359359
# second embeddings run for run "lemon-2" with a later timestamp
360360
td.embeddings.write(
361361
generate_sample_embeddings_for_run(
362-
td, run_id="lemon-2", timestamp="2025-08-03T00:00:00+00:00"
362+
td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
363363
)
364364
)
365365

@@ -372,7 +372,7 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
372372
assert len(all_lemon_2) == 10
373373

374374
# verify both timestamps exist
375-
unique_timestamps = all_lemon_2["timestamp"].unique()
375+
unique_timestamps = all_lemon_2["embedding_timestamp"].unique()
376376
assert len(unique_timestamps) == 2
377377

378378
# query current_embeddings for lemon source
@@ -392,8 +392,8 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
392392
assert (lemon_2_result["run_date"] == date(2025, 8, 2)).all()
393393

394394
# all lemon-2 current embeddings should have the later embedding timestamp
395-
max_timestamp = all_lemon_2["timestamp"].max()
396-
assert (lemon_2_result["timestamp"] == max_timestamp).all()
395+
max_timestamp = all_lemon_2["embedding_timestamp"].max()
396+
assert (lemon_2_result["embedding_timestamp"] == max_timestamp).all()
397397

398398

399399
def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_views):
@@ -405,14 +405,14 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v
405405
# write embeddings for lemon-2 (first time) with explicit timestamp
406406
td.embeddings.write(
407407
generate_sample_embeddings_for_run(
408-
td, run_id="lemon-2", timestamp="2025-08-02T00:00:00+00:00"
408+
td, run_id="lemon-2", embedding_timestamp="2025-08-02T00:00:00+00:00"
409409
)
410410
)
411411

412412
# write embeddings for lemon-2 again with later timestamp
413413
td.embeddings.write(
414414
generate_sample_embeddings_for_run(
415-
td, run_id="lemon-2", timestamp="2025-08-03T00:00:00+00:00"
415+
td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
416416
)
417417
)
418418

tests/utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def generate_sample_embeddings(
6868
embedding_model: str | None = "super-org/amazing-model",
6969
embedding_strategy: str | None = "full_record",
7070
run_id: str | None = None,
71-
timestamp: str | None = "2024-12-01T00:00:00+00:00",
71+
embedding_timestamp: str | None = "2024-12-01T00:00:00+00:00",
7272
) -> Iterator[DatasetEmbedding]:
7373
"""Generate sample DatasetEmbeddings."""
7474
if not run_id:
@@ -90,7 +90,7 @@ def generate_sample_embeddings(
9090
run_record_offset=x,
9191
embedding_model=embedding_model,
9292
embedding_strategy=embedding_strategy,
93-
timestamp=timestamp,
93+
embedding_timestamp=embedding_timestamp,
9494
embedding_vector=embedding_vector,
9595
embedding_object=embedding_object,
9696
)
@@ -101,7 +101,7 @@ def generate_sample_embeddings_for_run(
101101
run_id: str,
102102
embedding_model: str | None = "super-org/amazing-model",
103103
embedding_strategy: str | None = "full_record",
104-
timestamp: str | None = None,
104+
embedding_timestamp: str | None = None,
105105
embedding_dimensions: int = 3,
106106
) -> Iterator[DatasetEmbedding]:
107107
"""Generate sample DatasetEmbeddings for a given ETL run."""
@@ -114,8 +114,8 @@ def generate_sample_embeddings_for_run(
114114
"""
115115
).to_df()
116116

117-
if not timestamp:
118-
timestamp = records_metadata.iloc[0].run_timestamp.isoformat()
117+
if not embedding_timestamp:
118+
embedding_timestamp = records_metadata.iloc[0].run_timestamp.isoformat()
119119

120120
for _idx, record in records_metadata.iterrows():
121121
embedding_vector = [random.random() for _ in range(embedding_dimensions)]
@@ -131,5 +131,5 @@ def generate_sample_embeddings_for_run(
131131
embedding_strategy=embedding_strategy,
132132
embedding_vector=embedding_vector,
133133
embedding_object=embedding_object,
134-
timestamp=timestamp,
134+
embedding_timestamp=embedding_timestamp,
135135
)

timdex_dataset_api/embeddings.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
pa.field("timdex_record_id", pa.string()),
3232
pa.field("run_id", pa.string()),
3333
pa.field("run_record_offset", pa.int32()),
34-
pa.field("timestamp", pa.timestamp("us", tz="UTC")),
34+
pa.field("embedding_timestamp", pa.timestamp("us", tz="UTC")),
3535
pa.field("embedding_model", pa.string()),
3636
pa.field("embedding_strategy", pa.string()),
3737
pa.field("embedding_vector", pa.list_(pa.float32())),
@@ -47,7 +47,7 @@
4747
"timdex_record_id",
4848
"run_id",
4949
"run_record_offset",
50-
"timestamp",
50+
"embedding_timestamp",
5151
"embedding_model",
5252
"embedding_strategy",
5353
}
@@ -67,7 +67,7 @@ class EmbeddingsFilters(TypedDict, total=False):
6767
timdex_record_id: str
6868
run_id: str
6969
run_record_offset: int
70-
timestamp: str | datetime
70+
embedding_timestamp: str | datetime
7171
embedding_model: str
7272
embedding_strategy: str
7373
# record metadata columns
@@ -92,7 +92,7 @@ class DatasetEmbedding:
9292
embedding_strategy: Strategy used to create embedding
9393
- this correlates to a transformation strategy in the timdex-embeddings CLI
9494
application, e.g. "full_record"
95-
timestamp: Timestamp when embedding was created
95+
embedding_timestamp: Timestamp when embedding was created
9696
embedding_vector: Numerical vector representation of embedding
9797
- preferred form for storing embedding as a numerical array
9898
embedding_object: Object representation of the embedding
@@ -105,7 +105,7 @@ class DatasetEmbedding:
105105
run_record_offset: int = field()
106106
embedding_model: str = field()
107107
embedding_strategy: str = field()
108-
timestamp: datetime = field( # type: ignore[assignment]
108+
embedding_timestamp: datetime = field( # type: ignore[assignment]
109109
converter=datetime_iso_parse,
110110
default=attrs.Factory(lambda: datetime.now(tz=UTC).isoformat()),
111111
)
@@ -114,15 +114,15 @@ class DatasetEmbedding:
114114

115115
@property
116116
def year(self) -> str:
117-
return self.timestamp.strftime("%Y")
117+
return self.embedding_timestamp.strftime("%Y")
118118

119119
@property
120120
def month(self) -> str:
121-
return self.timestamp.strftime("%m")
121+
return self.embedding_timestamp.strftime("%m")
122122

123123
@property
124124
def day(self) -> str:
125-
return self.timestamp.strftime("%d")
125+
return self.embedding_timestamp.strftime("%d")
126126

127127
def to_dict(
128128
self,
@@ -232,15 +232,15 @@ def _create_current_embeddings_view(self, conn: DuckDBPyConnection) -> None:
232232
create or replace view data.current_embeddings as
233233
(
234234
with
235-
-- CTE of embeddings ranked by timestamp
235+
-- CTE of embeddings ranked by embedding_timestamp
236236
ce_ranked_embeddings as
237237
(
238238
select
239239
*,
240240
row_number() over (
241241
partition by timdex_record_id, embedding_strategy
242242
order by
243-
timestamp desc nulls last,
243+
embedding_timestamp desc nulls last,
244244
run_record_offset desc nulls last
245245
) as rn
246246
from data.embeddings
@@ -269,15 +269,15 @@ def _create_current_run_embeddings_view(self, conn: DuckDBPyConnection) -> None:
269269
create or replace view data.current_run_embeddings as
270270
(
271271
with
272-
-- CTE of embeddings ranked by timestamp
272+
-- CTE of embeddings ranked by embedding_timestamp
273273
ce_ranked_embeddings as
274274
(
275275
select
276276
*,
277277
row_number() over (
278278
partition by timdex_record_id, run_id, embedding_strategy
279279
order by
280-
timestamp desc nulls last,
280+
embedding_timestamp desc nulls last,
281281
run_id desc nulls last,
282282
run_record_offset desc nulls last
283283
) as rn

0 commit comments

Comments
 (0)