Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ I/O
- Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows``
when the requested row count is not a multiple of the chunk size (:issue:`64025`)
- Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
- Fixed bug in :meth:`HDFStore.put` where string extension dtype columns raised errors when using compression (:issue:`64180`)

Period
^^^^^^
Expand Down
132 changes: 68 additions & 64 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3271,10 +3271,6 @@ def write_array_empty(self, key: str, value: ArrayLike) -> None:
def write_array(
self, key: str, obj: AnyArrayLike, items: Index | None = None
) -> None:
# TODO: we only have a few tests that get here, the only EA
# that gets passed is DatetimeArray, and we never have
# both self._filters and EA

value = extract_array(obj, extract_numpy=True)

if key in self.group:
Expand All @@ -3295,71 +3291,79 @@ def write_array(
value = value.T
transposed = True

atom = None
if self._filters is not None:
with suppress(ValueError):
# get the atom for this datatype
atom = _tables().Atom.from_dtype(value.dtype)

if atom is not None:
# We only get here if self._filters is non-None and
# the Atom.from_dtype call succeeded

# create an empty chunked array and fill it from value
if not empty_array:
ca = self._handle.create_carray(
self.group, key, atom, value.shape, filters=self._filters
)
ca[:] = value

else:
self.write_array_empty(key, value)

elif value.dtype.type == np.object_:
# infer the type, warn if we have a non-string type here (for
# performance)
inferred_type = lib.infer_dtype(value, skipna=False)
if empty_array:
pass
elif inferred_type == "string":
pass
elif get_option("performance_warnings"):
ws = performance_doc % (inferred_type, key, items)
warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())

vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
vlarr.append(value)

elif lib.is_np_dtype(value.dtype, "M"):
self._handle.create_array(self.group, key, value.view("i8"))
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
elif isinstance(value.dtype, DatetimeTZDtype):
# store as UTC
# with a zone

# error: "ExtensionArray" has no attribute "asi8"
self._handle.create_array(
self.group,
key,
value.asi8, # type: ignore[attr-defined]
if isinstance(value, BaseStringArray):
# GH#64180: BaseStringArray must use the VLArray path.
# Atom.from_dtype does not handle ExtensionDtype.
vlarr = self._handle.create_vlarray(
self.group, key, _tables().ObjectAtom(), filters=self._filters
)

node = getattr(self.group, key)
# error: "ExtensionArray" has no attribute "tz"
node._v_attrs.tz = _get_tz(value.tz) # type: ignore[attr-defined]
node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
elif lib.is_np_dtype(value.dtype, "m"):
self._handle.create_array(self.group, key, value.view("i8"))
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
elif isinstance(value, BaseStringArray):
vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
vlarr.append(value.to_numpy())
node = getattr(self.group, key)
node._v_attrs.value_type = str(value.dtype)
elif empty_array:
self.write_array_empty(key, value)

else:
self._handle.create_array(self.group, key, value)
atom = None
if self._filters is not None:
with suppress(ValueError):
# get the atom for this datatype
atom = _tables().Atom.from_dtype(value.dtype)

if atom is not None:
# We only get here if self._filters is non-None and
# the Atom.from_dtype call succeeded

# create an empty chunked array and fill it from value
if not empty_array:
ca = self._handle.create_carray(
self.group, key, atom, value.shape, filters=self._filters
)
ca[:] = value

else:
self.write_array_empty(key, value)

elif value.dtype.type == np.object_:
# infer the type, warn if we have a non-string type here
# (for performance)
inferred_type = lib.infer_dtype(value, skipna=False)
if empty_array:
pass
elif inferred_type == "string":
pass
elif get_option("performance_warnings"):
ws = performance_doc % (inferred_type, key, items)
warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())

vlarr = self._handle.create_vlarray(
self.group, key, _tables().ObjectAtom()
)
vlarr.append(value)

elif lib.is_np_dtype(value.dtype, "M"):
self._handle.create_array(self.group, key, value.view("i8"))
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
elif isinstance(value.dtype, DatetimeTZDtype):
# store as UTC
# with a zone

# error: "ExtensionArray" has no attribute "asi8"
self._handle.create_array(
self.group,
key,
value.asi8, # type: ignore[attr-defined]
)

node = getattr(self.group, key)
# error: "ExtensionArray" has no attribute "tz"
node._v_attrs.tz = _get_tz(value.tz) # type: ignore[attr-defined]
node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
elif lib.is_np_dtype(value.dtype, "m"):
self._handle.create_array(self.group, key, value.view("i8"))
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
elif empty_array:
self.write_array_empty(key, value)
else:
self._handle.create_array(self.group, key, value)

getattr(self.group, key)._v_attrs.transposed = transposed

Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/io/pytables/test_put.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,18 @@ def test_put_str_series(temp_hdfstore, performance_warning, string_dtype_argumen
tm.assert_series_equal(result, expected)


def test_put_str_frame_complevel(temp_hdfstore, string_dtype_arguments):
    # GH#64180 - writing StringDtype columns to HDFStore with fixed format + complevel
    dtype = pd.StringDtype(*string_dtype_arguments)
    original = DataFrame(
        {"a": pd.array(["x", pd.NA, "y"], dtype=dtype), "b": [1, 2, 3]}
    )
    temp_hdfstore.put("df", original, complevel=1)

    # The round-tripped string column's dtype depends on the missing-value
    # sentinel: NaN-backed dtypes read back as "str", NA-backed as "string".
    roundtrip_dtype = "str" if dtype.na_value is np.nan else "string"
    expected = original.copy()
    expected["a"] = expected["a"].astype(roundtrip_dtype)

    tm.assert_frame_equal(temp_hdfstore.get("df"), expected)


@pytest.mark.parametrize("format", ["table", "fixed"])
@pytest.mark.parametrize(
"index",
Expand Down
Loading