Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion modin/core/storage_formats/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4627,12 +4627,13 @@ def write_items(df, broadcasted_items):
return df

if not is_scalar(item):
broadcasted_item, _ = broadcast_item(
broadcasted_item, _, _, _ = broadcast_item(
self,
row_numeric_index,
col_numeric_index,
item,
need_columns_reindex=need_columns_reindex,
sort_lookups_and_item=False,
)
else:
broadcasted_item = item
Expand Down
7 changes: 6 additions & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4704,7 +4704,12 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
return partition

if not is_scalar(item):
broadcasted_item, broadcasted_dtypes = broadcast_item(
(
broadcasted_item,
broadcasted_dtypes,
row_numeric_index,
col_numeric_index,
) = broadcast_item(
self,
row_numeric_index,
col_numeric_index,
Expand Down
66 changes: 61 additions & 5 deletions modin/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from __future__ import annotations

from typing import Iterator, Optional, Tuple
from typing import Any, Iterator, Optional, Tuple

import numpy as np
import pandas
Expand Down Expand Up @@ -221,7 +221,8 @@
row_lookup,
col_lookup,
item,
need_columns_reindex=True,
need_columns_reindex: bool = True,
sort_lookups_and_item: bool = True,
):
"""
Use NumPy to broadcast or reshape item with reindexing.
Expand All @@ -239,12 +240,17 @@
need_columns_reindex : bool, default: True
In the case of assigning columns to a dataframe (broadcasting is
part of the flow), reindexing is not needed.
sort_lookups_and_item : bool, default: True
If set, sort the lookups in ascending order and the item to match. This is necessary to
ensure writes across multiple partitions are ordered correctly when the lookups are unsorted.

Returns
-------
(np.ndarray, Optional[Series])
(np.ndarray, Optional[Series], array-like, array-like)
* np.ndarray - `item` after it was broadcasted to `to_shape`.
* Series - item's dtypes.
* array-like - sorted version of `row_lookup` (may or may not be the same reference)
* array-like - sorted version of `col_lookup` (may or may not be the same reference)

Raises
------
Expand All @@ -259,6 +265,7 @@
"""
# It is valid to pass a DataFrame or Series to __setitem__ that is larger than
# the target the user is trying to overwrite.

from .dataframe import DataFrame
from .series import Series

Expand Down Expand Up @@ -295,12 +302,61 @@
# Cast to numpy drop information about heterogeneous types (cast to common)
# TODO: we shouldn't do that, maybe there should be the if branch
item = np.array(item)

def sort_index(lookup: Any) -> np.ndarray:
"""
Return the argsort and sorted version of the lookup index.

Values in the lookup are guaranteed by the indexing frontend to be non-negative.

The sort operation must be stable to ensure proper behavior for iloc set, which
will use the last item encountered if two items share an index.
"""
if isinstance(lookup, slice):
# Special case for if a descending slice is passed
# Directly calling np.array(slice(...)) does not work
lookup = range(lookup.start or 0, lookup.stop or 0, lookup.step or 0)

Check warning on line 318 in modin/pandas/utils.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/utils.py#L318

Added line #L318 was not covered by tests
argsort_index = np.argsort(lookup, kind="stable")
return argsort_index, np.array(lookup)[argsort_index]

def should_avoid_sort(lookup: Any) -> bool:
return (
not sort_lookups_and_item
or (
isinstance(lookup, (range, pandas.RangeIndex, slice))
and lookup.step is not None
and lookup.step > 0
)
or (isinstance(lookup, slice) and lookup == slice(None))
)

# Fast path to avoid sorting for range/RangeIndex, which are already sorted, or the empty slice
avoid_row_lookup_sort = should_avoid_sort(row_lookup)
avoid_col_lookup_sort = should_avoid_sort(col_lookup)
# Sort both the columns and rows if necessary
if item.ndim >= 2:
if avoid_row_lookup_sort:
if not avoid_col_lookup_sort:
col_argsort, col_lookup = sort_index(col_lookup)
item = item[:, col_argsort]
elif avoid_col_lookup_sort:
row_argsort, row_lookup = sort_index(row_lookup)
item = item[row_argsort, :]
else:
row_argsort, row_lookup = sort_index(row_lookup)
col_argsort, col_lookup = sort_index(col_lookup)
# Use np.ix_ to handle broadcasting errors
item = item[np.ix_(row_argsort, col_argsort)]
elif not avoid_row_lookup_sort:
# Item is 1D, so only sort row indexer
row_argsort, row_lookup = sort_index(row_lookup)
item = item[row_argsort]
if dtypes is None:
dtypes = pandas.Series([item.dtype] * len(col_lookup))
if np.prod(to_shape) == np.prod(item.shape):
return item.reshape(to_shape), dtypes
return item.reshape(to_shape), dtypes, row_lookup, col_lookup
else:
return np.broadcast_to(item, to_shape), dtypes
return np.broadcast_to(item, to_shape), dtypes, row_lookup, col_lookup
except ValueError:
from_shape = np.array(item).shape
raise ValueError(
Expand Down
74 changes: 74 additions & 0 deletions modin/tests/pandas/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2693,3 +2693,77 @@ def test_index_of_empty_frame():

assert md_res.empty and pd_res.empty
df_equals(md_res.index, pd_res.index)


# Regression test for https://github.com/modin-project/modin/issues/7405
@pytest.mark.parametrize("indexer", ["loc", "iloc"])
def test_loc_and_iloc_set_order(indexer):
    rng = np.random.default_rng(seed=0)
    use_loc = indexer == "loc"
    source_data = {"col": rng.integers(0, 100, size=100)}
    num_writes = 20
    # Unsorted positions to write into; duplicates are possible and intended.
    rows_to_touch = rng.integers(0, 100, size=num_writes)
    column_key = "col" if use_loc else 0
    new_values = range(100, 100 + num_writes)
    md_df, pd_df = create_test_dfs(source_data)

    def read_cells(df):
        accessor = df.loc if use_loc else df.iloc
        return accessor[rows_to_touch, column_key]

    def write_cells(df):
        accessor = df.loc if use_loc else df.iloc
        accessor[rows_to_touch, column_key] = new_values

    # The read must already succeed before any mutation happens.
    eval_general(md_df, pd_df, read_cells)
    # Mutate in place and compare the resulting frames.
    eval_general(
        md_df,
        pd_df,
        write_cells,
        __inplace__=True,
    )
    # Both frames must still agree on a follow-up read.
    eval_general(md_df, pd_df, read_cells)


def test_iloc_set_negative_index():
    rng = np.random.default_rng(seed=0)
    n_rows = 50
    n_cols = 80
    frame_data = {f"col_{i}": rng.integers(0, 100, size=n_rows) for i in range(n_cols)}
    n_row_writes = 20
    n_col_writes = 30
    # Unsorted positional indexers; values may repeat and may be negative.
    rows_to_touch = rng.integers(-n_rows, n_rows, size=n_row_writes)
    cols_to_touch = rng.integers(-n_cols, n_cols, size=n_col_writes)
    new_values = np.reshape(
        range(100, 100 + n_row_writes * n_col_writes), (n_row_writes, n_col_writes)
    )
    md_df, pd_df = create_test_dfs(frame_data)

    def read_cells(df):
        return df.iloc[rows_to_touch, cols_to_touch]

    def write_cells(df):
        df.iloc[rows_to_touch, cols_to_touch] = new_values

    # Reading through the negative indexers must succeed before mutation.
    eval_general(md_df, pd_df, read_cells)
    # Write in place and compare the resulting frames.
    eval_general(
        md_df,
        pd_df,
        write_cells,
        __inplace__=True,
    )
    # Both frames must still agree on a follow-up read.
    eval_general(md_df, pd_df, read_cells)
Loading