Skip to content

Commit eb4dfda

Browse files
Merge pull request #149 from DunklesArchipel/fix-dialect
Fix dialect
2 parents 3e8ad80 + ce18a40 commit eb4dfda

File tree

7 files changed

+266
-28
lines changed

7 files changed

+266
-28
lines changed

doc/conf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
project = 'unitpackage'
2-
copyright = '2022-2023, the unitpackage authors'
2+
copyright = '2022-2026, the unitpackage authors'
33
author = 'the unitpackage authors'
44

55
release = '0.12.0'
@@ -43,6 +43,10 @@
4343

4444
# Ignore the link to the GNU General Public License v3.0
4545
# This is because checking results in a timeout.
46+
# Zenodo badge and record URLs are excluded because Zenodo's servers
47+
# block automated link checkers with 403 responses.
4648
linkcheck_ignore = [
4749
"https://www.gnu.org/licenses/gpl-3.0.html*",
50+
"https://zenodo.org/badge/*",
51+
"https://zenodo.org/records/*",
4852
]

doc/news/fix-dialect.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
**Fixed:**
2+
3+
* Fixed CSV-to-dataframe reconstruction from tabular resources to honor frictionless descriptor dialect and encoding metadata, avoiding silent misparsing for non-default delimiters.
4+
* Fixed the CSV loader API by replacing the ambiguous `delimiters` argument with explicit `delimiter` and `candidate_delimiters` parameters.
5+

doc/usage/load_and_save.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ For CSV files with more complex structures, additional arguments can be provided
9797
- `header_lines` — number of header lines to skip before the data
9898
- `column_header_lines` — number of lines containing column headers (multiple lines are flattened and separated by ` / `)
9999
- `decimal` — decimal separator (e.g., `','` for European-style numbers)
100-
- `delimiters` — column delimiter (auto-detected if not specified)
100+
- `delimiter` — explicit column delimiter
101+
- `candidate_delimiters` — candidate delimiters used during autodetection
101102
- `encoding` — file encoding
102103

103104
For example, a CSV with multiple header lines:
@@ -129,7 +130,7 @@ The loader automatically detects headers and delimiters. The resulting entry con
129130
entry.fields
130131
```
131132

132-
Information on the file structure is stored in the entry's metadata under `dsvDescription`:
133+
Information on the detected file structure is available in the entry's metadata (for example under `dsvDescription`):
133134

134135
```{code-cell} ipython3
135136
entry.metadata['dsvDescription']['loader']

doc/usage/loaders.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,15 @@ Multiple column headers will be flattened.
7373
from unitpackage.loaders.baseloader import BaseLoader
7474
csv = BaseLoader(file, header_lines=6,
7575
column_header_lines=2,
76-
delimiters=None,
76+
delimiter="\t",
77+
candidate_delimiters=None,
7778
decimal=None)
7879
csv.df
7980
```
8081

82+
To pin a known delimiter, pass `delimiter='\t'` (or any other separator).
83+
To restrict autodetection, pass `candidate_delimiters=['\t', ';']`.
84+
8185
All parts of the file are accessible from the API for further use. For example the extraction of metadata from the header.
8286

8387
```{code-cell} ipython3

unitpackage/entry.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,7 +1092,8 @@ def from_csv( # pylint: disable=too-many-locals
10921092
header_lines=None,
10931093
column_header_lines=None,
10941094
decimal=None,
1095-
delimiters=None,
1095+
delimiter=None,
1096+
candidate_delimiters=None,
10961097
device=None,
10971098
):
10981099
r"""
@@ -1105,6 +1106,9 @@ def from_csv( # pylint: disable=too-many-locals
11051106
A ``device`` can be specified to select a device-specific loader
11061107
(e.g., ``'eclab'`` or ``'gamry'``).
11071108
1109+
``candidate_delimiters`` can be used to restrict delimiter sniffing to
1110+
a known set of separators.
1111+
11081112
EXAMPLES::
11091113
11101114
>>> from unitpackage.entry import Entry
@@ -1135,6 +1139,18 @@ def from_csv( # pylint: disable=too-many-locals
11351139
[{'name': 'E / V', 'type': 'integer'},
11361140
{'name': 'j / A / cm2', 'type': 'integer'}]
11371141
1142+
Candidate delimiters can be provided explicitly when parsing a file::
1143+
1144+
>>> import os
1145+
>>> import tempfile
1146+
>>> with tempfile.TemporaryDirectory() as tmpdir:
1147+
... filename = os.path.join(tmpdir, 'candidate_delimiters.csv')
1148+
... with open(filename, 'w', encoding='utf-8') as handle:
1149+
... _ = handle.write('a\tb\n1\t2\n')
1150+
... entry = Entry.from_csv(csvname=filename, candidate_delimiters=[';', '\t'])
1151+
>>> entry.metadata['dsvDescription']['delimiter']
1152+
'\t'
1153+
11381154
A device-specific loader can be used to parse instrument files::
11391155
11401156
>>> entry = Entry.from_csv(csvname='test/loader_data/eclab_cv.mpt', device='eclab')
@@ -1149,6 +1165,7 @@ def from_csv( # pylint: disable=too-many-locals
11491165
11501166
>>> entry.metadata['dsvDescription']['loader']
11511167
'ECLabLoader'
1168+
11521169
>>> entry.metadata['dsvDescription']['delimiter']
11531170
'\t'
11541171
@@ -1162,7 +1179,8 @@ def from_csv( # pylint: disable=too-many-locals
11621179
"header_lines": header_lines,
11631180
"column_header_lines": column_header_lines,
11641181
"decimal": decimal,
1165-
"delimiters": delimiters,
1182+
"delimiter": delimiter,
1183+
"candidate_delimiters": candidate_delimiters,
11661184
}
11671185

11681186
loader_cls = BaseLoader.create(device) if device else BaseLoader

unitpackage/loaders/baseloader.py

Lines changed: 161 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949

5050
import logging
51+
from collections.abc import Iterable
5152

5253
logger = logging.getLogger("loader")
5354

@@ -94,21 +95,90 @@ class BaseLoader:
9495
0 2 0 0.1 0 0
9596
1 2 1 1.4 5 1
9697
98+
Candidate delimiters can be provided explicitly for autodetection::
99+
100+
>>> from io import StringIO
101+
>>> file = StringIO('''a\tb
102+
... 0\t0
103+
... 1\t1''')
104+
>>> csv = BaseLoader(file, candidate_delimiters=[';', '\t'])
105+
>>> csv.delimiter
106+
'\t'
107+
97108
"""
98109

110+
DEFAULT_CANDIDATE_DELIMITERS = ("\t", ";", ",")
111+
DELIMITER_SNIFF_SAMPLE_LINES = 25
112+
_warned_default_candidate_delimiters = False
113+
99114
def __init__(
100115
self,
101116
file,
102117
header_lines=None,
103118
column_header_lines=None,
104119
decimal=None,
105-
delimiters=None,
120+
delimiter=None,
121+
candidate_delimiters=None,
106122
): # pylint: disable=dangerous-default-value
107123
self._file = file.read()
108124
self._header_lines = header_lines
109125
self._column_header_lines = column_header_lines
110126
self._decimal = decimal
111-
self.delimiters = delimiters or ["\t", ";", ","]
127+
self._delimiter = delimiter
128+
self._candidate_delimiters = self._normalize_delimiter_candidates(
129+
delimiter=delimiter,
130+
candidate_delimiters=candidate_delimiters,
131+
)
132+
133+
@staticmethod
134+
def _normalize_delimiter_candidates(delimiter=None, candidate_delimiters=None):
135+
r"""Return delimiter candidates normalized to a list of strings.
136+
137+
The public API separates the explicit delimiter from sniffing candidates:
138+
139+
- ``delimiter=","`` fixes the delimiter to a single value.
140+
- ``candidate_delimiters=["\t", ";", ","]`` provides candidates for sniffing.
141+
142+
If ``delimiter`` is provided, ``candidate_delimiters`` must not be provided.
143+
144+
EXAMPLES::
145+
146+
>>> BaseLoader._normalize_delimiter_candidates(delimiter=',')
147+
[',']
148+
149+
>>> BaseLoader._normalize_delimiter_candidates(candidate_delimiters=['\t', ';'])
150+
['\t', ';']
151+
152+
>>> BaseLoader._normalize_delimiter_candidates(delimiter=',', candidate_delimiters=[';'])
153+
Traceback (most recent call last):
154+
...
155+
ValueError: Use either 'delimiter' or 'candidate_delimiters', not both.
156+
"""
157+
if delimiter is not None and candidate_delimiters is not None:
158+
raise ValueError(
159+
"Use either 'delimiter' or 'candidate_delimiters', not both."
160+
)
161+
162+
if delimiter is not None:
163+
return [delimiter]
164+
165+
if candidate_delimiters is None:
166+
if not BaseLoader._warned_default_candidate_delimiters:
167+
logger.warning(
168+
"No delimiter or candidate_delimiters were provided; using default candidate delimiters for sniffing."
169+
)
170+
BaseLoader._warned_default_candidate_delimiters = True
171+
return list(BaseLoader.DEFAULT_CANDIDATE_DELIMITERS)
172+
173+
if isinstance(candidate_delimiters, str):
174+
return [candidate_delimiters]
175+
176+
if isinstance(candidate_delimiters, Iterable):
177+
return list(candidate_delimiters)
178+
179+
raise TypeError(
180+
"'candidate_delimiters' must be a string or an iterable of strings."
181+
)
112182

113183
@property
114184
def file(self):
@@ -576,20 +646,102 @@ def delimiter(self):
576646
>>> csv.delimiter
577647
'\t'
578648
649+
Candidate delimiters are considered for sniffing even if the correct
650+
delimiter is not the first candidate::
651+
652+
>>> from io import StringIO
653+
>>> file = StringIO('''a\tb\n0\t0\n1\t1''')
654+
>>> csv = BaseLoader(file, candidate_delimiters=[';', '\t', ','])
655+
>>> csv.delimiter
656+
'\t'
657+
658+
Inconsistent field counts between column headers and data rows are
659+
reported early::
660+
661+
>>> from io import StringIO
662+
>>> file = StringIO('''a,b\n0,0\n1,1,1''')
663+
>>> csv = BaseLoader(file, delimiter=',')
664+
>>> csv.delimiter
665+
Traceback (most recent call last):
666+
...
667+
ValueError: Inconsistent number of fields detected in data line 2: expected 2 based on column headers but found 3.
668+
579669
"""
580-
# TODO:: Validate that the number of delimiters in the data lines
581-
# matches those in the column header line.
582-
# This will otherwise likely lead to erroneous loading of pandas dataframes
583-
# and requires setting the column names specifically.
584-
if len(self.delimiters) == 1:
585-
return self.delimiters[0]
670+
if self._delimiter is not None:
671+
self._validate_delimiter_consistency(self._delimiter)
672+
return self._delimiter
673+
674+
if len(self._candidate_delimiters) == 1:
675+
delimiter = self._candidate_delimiters[0]
676+
self._validate_delimiter_consistency(delimiter)
677+
return delimiter
586678

587679
import csv
588680
from io import StringIO
589681

590682
combined = StringIO(self.column_headers.getvalue() + self.data.getvalue())
683+
sample_lines = []
684+
for _ in range(self.DELIMITER_SNIFF_SAMPLE_LINES):
685+
line = combined.readline()
686+
if not line:
687+
break
688+
sample_lines.append(line)
689+
690+
sample = "".join(sample_lines)
691+
if not sample:
692+
raise ValueError("Delimiter could not be determined from an empty sample.")
693+
694+
delimiter = csv.Sniffer().sniff(sample, self._candidate_delimiters).delimiter
695+
self._validate_delimiter_consistency(delimiter)
696+
return delimiter
697+
698+
def _validate_delimiter_consistency(self, delimiter):
699+
r"""Validate that sampled data rows have the same field count as the
700+
column headers. Returns ``True`` if all sampled rows are consistent.
701+
702+
EXAMPLES::
703+
704+
>>> from io import StringIO
705+
>>> file = StringIO('''a,b\n0,0\n1,1''')
706+
>>> csv = BaseLoader(file, delimiter=',')
707+
>>> csv._validate_delimiter_consistency(',')
708+
True
709+
710+
>>> from io import StringIO
711+
>>> file = StringIO('''a,b\n0,0\n1,1,1''')
712+
>>> csv = BaseLoader(file, delimiter=',')
713+
>>> csv._validate_delimiter_consistency(',')
714+
Traceback (most recent call last):
715+
...
716+
ValueError: Inconsistent number of fields detected in data line 2: expected 2 based on column headers but found 3.
717+
718+
"""
719+
import csv
720+
721+
column_header_lines = self.column_headers.getvalue().splitlines()
722+
if not column_header_lines:
723+
return True
724+
725+
expected_fields = len(
726+
next(csv.reader([column_header_lines[0]], delimiter=delimiter))
727+
)
591728

592-
return csv.Sniffer().sniff(combined.readline(), self.delimiters).delimiter
729+
for line_number, line in enumerate(
730+
self.data.getvalue().splitlines()[: self.DELIMITER_SNIFF_SAMPLE_LINES],
731+
start=1,
732+
):
733+
if not line.strip():
734+
continue
735+
736+
actual_fields = len(next(csv.reader([line], delimiter=delimiter)))
737+
if actual_fields != expected_fields:
738+
raise ValueError(
739+
"Inconsistent number of fields detected in data line "
740+
f"{line_number}: expected {expected_fields} based on "
741+
f"column headers but found {actual_fields}."
742+
)
743+
744+
return True
593745

594746
@property
595747
def decimal(self):

0 commit comments

Comments
 (0)