11# ruff: noqa: S105, S106, SLF001, PLR2004, PD901, D209, D205
2-
3- import datetime
42import math
53import os
4+ import re
5+ from datetime import UTC , datetime
6+ from unittest .mock import patch
67
78import pyarrow .dataset as ds
89import pytest
910
11+ from tests .utils import generate_sample_records
1012from timdex_dataset_api .dataset import (
1113 MAX_ROWS_PER_FILE ,
1214 TIMDEX_DATASET_SCHEMA ,
1315 DatasetNotLoadedError ,
1416 TIMDEXDataset ,
1517)
16- from timdex_dataset_api .exceptions import InvalidDatasetRecordError
1718from timdex_dataset_api .record import DatasetRecord
1819
1920
20- def test_dataset_record_serialization ():
21+ def test_dataset_record_init ():
2122 values = {
2223 "timdex_record_id" : "alma:123" ,
2324 "source_record" : b"<record><title>Hello World.</title></record>" ,
@@ -26,38 +27,35 @@ def test_dataset_record_serialization():
2627 "run_date" : "2024-12-01" ,
2728 "run_type" : "full" ,
2829 "action" : "index" ,
29- "run_id" : "abc123 " ,
30+ "run_id" : "000-111-aaa-bbb " ,
3031 }
31- dataset_record = DatasetRecord (** values )
32- assert dataset_record .to_dict () == values
32+ record = DatasetRecord (** values )
33+ assert record
34+ assert (record .year , record .month , record .day ) == (
35+ "2024" ,
36+ "12" ,
37+ "01" ,
38+ )
3339
3440
35- def test_dataset_record_serialization_with_partition_values_provided ():
36- dataset_record = DatasetRecord (
37- timdex_record_id = "alma:123" ,
38- source_record = b"<record><title>Hello World.</title></record>" ,
39- transformed_record = b"""{"title":["Hello World."]}""" ,
40- )
41- partition_values = {
42- "source" : "alma" ,
43- "run_date" : "2024-12-01" ,
44- "run_type" : "daily" ,
45- "action" : "index" ,
46- "run_id" : "000-111-aaa-bbb" ,
47- }
48- assert dataset_record .to_dict (partition_values = partition_values ) == {
def test_dataset_record_init_with_invalid_run_date_raise_error():
    """A malformed run_date string must raise ValueError at init time."""
    with pytest.raises(
        ValueError,
        match=re.escape("time data '-12-01' does not match format '%Y-%m-%d'"),
    ):
        DatasetRecord(
            timdex_record_id="alma:123",
            source_record=b"<record><title>Hello World.</title></record>",
            transformed_record=b"""{"title":["Hello World."]}""",
            source="libguides",
            run_date="-12-01",
            run_type="full",
            action="index",
            run_id="000-111-aaa-bbb",
        )
5856
5957
60- def test_dataset_record_serialization_missing_partition_raise_error ():
def test_dataset_record_serialization():
    """to_dict() echoes the init values, with run_date parsed to a datetime and
    year/month/day partition columns derived from it."""
    init_values = {
        "timdex_record_id": "alma:123",
        "source_record": b"<record><title>Hello World.</title></record>",
        "transformed_record": b"""{"title":["Hello World."]}""",
        "source": "libguides",
        "run_date": "2024-12-01",
        "run_type": "full",
        "action": "index",
        "run_id": "abc123",
    }
    record = DatasetRecord(**init_values)
    expected = {
        **init_values,
        # the run_date string is parsed into a timezone-aware datetime
        "run_date": datetime(2024, 12, 1).astimezone(UTC),
        # partition columns derived from run_date
        "year": "2024",
        "month": "12",
        "day": "01",
    }
    assert record.to_dict() == expected
7783
7884
7985def test_dataset_write_records_to_new_dataset (new_dataset , sample_records_iter ):
@@ -134,52 +140,6 @@ def test_dataset_write_to_multiple_locations_raise_error(sample_records_iter):
134140 timdex_dataset .write (sample_records_iter (10 ))
135141
136142
137- def test_dataset_write_mixin_partition_values_used (
138- new_dataset , sample_records_iter_without_partitions
139- ):
140- partition_values = {
141- "source" : "alma" ,
142- "run_date" : "2024-12-01" ,
143- "run_type" : "daily" ,
144- "action" : "index" ,
145- "run_id" : "000-111-aaa-bbb" ,
146- }
147- _written_files = new_dataset .write (
148- sample_records_iter_without_partitions (10 ),
149- partition_values = partition_values ,
150- )
151- new_dataset .reload ()
152-
153- # load as pandas dataframe and assert column values
154- df = new_dataset .dataset .to_table ().to_pandas ()
155- row = df .iloc [0 ]
156- assert row .source == partition_values ["source" ]
157- assert row .run_date == datetime .date (2024 , 12 , 1 )
158- assert row .run_type == partition_values ["run_type" ]
159- assert row .action == partition_values ["action" ]
160- assert row .action == partition_values ["action" ]
161-
162-
163- def test_dataset_write_schema_partitions_correctly_ordered (
164- new_dataset , sample_records_iter
165- ):
166- written_files = new_dataset .write (
167- sample_records_iter (10 ),
168- partition_values = {
169- "source" : "alma" ,
170- "run_date" : "2024-12-01" ,
171- "run_type" : "daily" ,
172- "run_id" : "000-111-aaa-bbb" ,
173- "action" : "index" ,
174- },
175- )
176- file = written_files [0 ]
177- assert (
178- "/source=alma/run_date=2024-12-01/run_type=daily"
179- "/run_id=000-111-aaa-bbb/action=index/" in file .path
180- )
181-
182-
183143def test_dataset_write_schema_applied_to_dataset (new_dataset , sample_records_iter ):
184144 new_dataset .write (sample_records_iter (10 ))
185145
@@ -194,67 +154,63 @@ def test_dataset_write_schema_applied_to_dataset(new_dataset, sample_records_ite
194154 assert set (dataset .schema .names ) == set (TIMDEX_DATASET_SCHEMA .names )
195155
196156
197- def test_dataset_write_partition_deleted_when_written_to_again (
198- new_dataset , sample_records_iter
199- ):
200- """This tests the existing_data_behavior="delete_matching" configuration when writing
201- to a dataset."""
202- partition_values = {
203- "source" : "alma" ,
204- "run_date" : "2024-12-01" ,
205- "run_type" : "daily" ,
206- "action" : "index" ,
207- "run_id" : "000-111-aaa-bbb" ,
208- }
def test_dataset_write_partition_for_single_source(new_dataset, sample_records_iter):
    """A single-source write produces one file under year/month/day partitions."""
    written_files = new_dataset.write(sample_records_iter(10))
    first_file = written_files[0]

    assert len(written_files) == 1
    assert os.path.exists(new_dataset.location)
    assert "year=2024/month=12/day=01" in first_file.path
209162
210- # perform FIRST write to run_date="2024-12-01"
211- written_files_1 = new_dataset .write (
212- sample_records_iter (10 ),
213- partition_values = partition_values ,
214- )
215163
216- # assert that files from first write are present at this time
217- assert os .path .exists (written_files_1 [0 ].path )
def test_dataset_write_partition_for_multiple_sources(new_dataset, sample_records_iter):
    """Writes for different sources coexist: a later write for one source leaves
    the other source's files untouched and row counts accumulate."""
    # perform write for source="alma" and run_date="2024-12-01"
    alma_files = new_dataset.write(sample_records_iter(10))
    new_dataset.reload()

    assert os.path.exists(alma_files[0].path)
    assert new_dataset.row_count == 10

    # perform write for source="libguides" and run_date="2024-12-01"
    libguides_files = new_dataset.write(
        generate_sample_records(
            num_records=7, timdex_record_id_prefix="libguides", source="libguides"
        )
    )
    new_dataset.reload()

    # both sources' files remain; total rows = 10 (alma) + 7 (libguides)
    assert os.path.exists(alma_files[0].path)
    assert os.path.exists(libguides_files[0].path)
    assert new_dataset.row_count == 17
240183
241- # assert that files from first run_date="2024-12-01" are gone, second exist
242- # and files from run_date="2024-12-15" also exist
243- assert not os .path .exists (written_files_1 [0 ].path )
244- assert os .path .exists (written_files_2 [0 ].path )
245- assert os .path .exists (written_files_x [0 ].path )
246184
def test_dataset_write_partition_ignore_existing_data(new_dataset, sample_records_iter):
    """Repeated writes to the same partition append new files rather than
    deleting existing ones."""
    # perform two (2) writes for source="alma" and run_date="2024-12-01"
    first_write = new_dataset.write(sample_records_iter(10))
    second_write = new_dataset.write(sample_records_iter(10))
    new_dataset.reload()

    # assert that both files exist and no overwriting occurs
    assert os.path.exists(first_write[0].path)
    assert os.path.exists(second_write[0].path)
    assert new_dataset.row_count == 20
195+
196+
@patch("timdex_dataset_api.dataset.uuid.uuid4")
def test_dataset_write_partition_overwrite_files_with_same_name(
    mock_uuid, new_dataset, sample_records_iter
):
    """This test is to demonstrate existing_data_behavior="overwrite_or_ignore".

    Duplicate uuid.uuid4 values are effectively impossible in practice, so the
    method is patched to a constant here, forcing two writes to produce
    identically named files; the second write then replaces the first.
    """
    mock_uuid.return_value = "abc"

    # two writes for the same source/run_date now collide on filename
    _ = new_dataset.write(sample_records_iter(10))
    second_write = new_dataset.write(sample_records_iter(7))
    new_dataset.reload()

    # only the second write's file survives, so only its 7 rows remain
    assert os.path.exists(second_write[0].path)
    assert new_dataset.row_count == 7
0 commit comments