Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion modin/core/storage_formats/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4627,12 +4627,13 @@ def write_items(df, broadcasted_items):
return df

if not is_scalar(item):
broadcasted_item, _ = broadcast_item(
broadcasted_item, _, _, _ = broadcast_item(
self,
row_numeric_index,
col_numeric_index,
item,
need_columns_reindex=need_columns_reindex,
sort_lookups_and_item=False,
)
else:
broadcasted_item = item
Expand Down
7 changes: 6 additions & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4704,7 +4704,12 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
return partition

if not is_scalar(item):
broadcasted_item, broadcasted_dtypes = broadcast_item(
(
broadcasted_item,
broadcasted_dtypes,
row_numeric_index,
col_numeric_index,
) = broadcast_item(
self,
row_numeric_index,
col_numeric_index,
Expand Down
66 changes: 61 additions & 5 deletions modin/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from __future__ import annotations

from typing import Iterator, Optional, Tuple
from typing import Any, Iterator, Optional, Tuple

import numpy as np
import pandas
Expand Down Expand Up @@ -221,7 +221,8 @@
row_lookup,
col_lookup,
item,
need_columns_reindex=True,
need_columns_reindex: bool = True,
sort_lookups_and_item: bool = True,
):
"""
Use NumPy to broadcast or reshape item with reindexing.
Expand All @@ -239,12 +240,17 @@
need_columns_reindex : bool, default: True
In the case of assigning columns to a dataframe (broadcasting is
part of the flow), reindexing is not needed.
sort_lookups_and_item : bool, default: True
If set, sort the lookups in ascending order and the item to match. This is necessary to
ensure writes across multiple partitions are ordered correctly when the lookups are unsorted.

Returns
-------
(np.ndarray, Optional[Series])
(np.ndarray, Optional[Series], array-like, array-like)
* np.ndarray - `item` after it was broadcasted to `to_shape`.
* Series - item's dtypes.
* array-like - sorted version of `row_lookup` (may or may not be the same reference)
* array-like - sorted version of `col_lookup` (may or may not be the same reference)

Raises
------
Expand All @@ -259,6 +265,7 @@
"""
# It is valid to pass a DataFrame or Series to __setitem__ that is larger than
# the target the user is trying to overwrite.

from .dataframe import DataFrame
from .series import Series

Expand Down Expand Up @@ -295,12 +302,61 @@
# Cast to numpy drop information about heterogeneous types (cast to common)
# TODO: we shouldn't do that, maybe there should be the if branch
item = np.array(item)

def sort_index(lookup: Any) -> np.ndarray:
"""
Return the argsort and sorted version of the lookup index.

Values in the lookup are guaranteed by the indexing frontend to be non-negative.

The sort operation must be stable to ensure proper behavior for iloc set, which
will use the last item encountered if two items share an index.
"""
if isinstance(lookup, slice):
# Special case for if a descending slice is passed
# Directly calling np.array(slice(...)) does not work
lookup = range(lookup.start or 0, lookup.stop or 0, lookup.step or 0)

Check warning on line 318 in modin/pandas/utils.py

View check run for this annotation

Codecov / codecov/patch

modin/pandas/utils.py#L318

Added line #L318 was not covered by tests
argsort_index = np.argsort(lookup, kind="stable")
return argsort_index, np.array(lookup)[argsort_index]

def should_avoid_sort(lookup: Any) -> bool:
return (
not sort_lookups_and_item
or (
isinstance(lookup, (range, pandas.RangeIndex, slice))
and lookup.step is not None
and lookup.step > 0
)
or (isinstance(lookup, slice) and lookup == slice(None))
)

# Fast path to avoid sorting for range/RangeIndex, which are already sorted, or the empty slice
avoid_row_lookup_sort = should_avoid_sort(row_lookup)
avoid_col_lookup_sort = should_avoid_sort(col_lookup)
# Sort both the columns and rows if necessary
if item.ndim >= 2:
if avoid_row_lookup_sort:
if not avoid_col_lookup_sort:
col_argsort, col_lookup = sort_index(col_lookup)
item = item[:, col_argsort]
elif avoid_col_lookup_sort:
row_argsort, row_lookup = sort_index(row_lookup)
item = item[row_argsort, :]
else:
row_argsort, row_lookup = sort_index(row_lookup)
col_argsort, col_lookup = sort_index(col_lookup)
# Use np.ix_ to handle broadcasting errors
item = item[np.ix_(row_argsort, col_argsort)]
elif not avoid_row_lookup_sort:
# Item is 1D, so only sort row indexer
row_argsort, row_lookup = sort_index(row_lookup)
item = item[row_argsort]
if dtypes is None:
dtypes = pandas.Series([item.dtype] * len(col_lookup))
if np.prod(to_shape) == np.prod(item.shape):
return item.reshape(to_shape), dtypes
return item.reshape(to_shape), dtypes, row_lookup, col_lookup
else:
return np.broadcast_to(item, to_shape), dtypes
return np.broadcast_to(item, to_shape), dtypes, row_lookup, col_lookup
except ValueError:
from_shape = np.array(item).shape
raise ValueError(
Expand Down
74 changes: 74 additions & 0 deletions modin/tests/pandas/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2693,3 +2693,77 @@ def test_index_of_empty_frame():

assert md_res.empty and pd_res.empty
df_equals(md_res.index, pd_res.index)


# Regression test for https://github.com/modin-project/modin/issues/7405
@pytest.mark.parametrize("indexer", ["loc", "iloc"])
def test_loc_and_iloc_set_order(indexer):
    rng = np.random.default_rng(seed=0)
    use_loc = indexer == "loc"
    source_data = {"col": rng.integers(0, 100, size=100)}
    num_writes = 20
    # Unsorted positions to write into; duplicates are possible and intended.
    rows_to_touch = rng.integers(0, 100, size=num_writes)
    column_key = "col" if use_loc else 0
    new_values = range(100, 100 + num_writes)
    md_df, pd_df = create_test_dfs(source_data)

    def read_cells(df):
        accessor = df.loc if use_loc else df.iloc
        return accessor[rows_to_touch, column_key]

    def write_cells(df):
        accessor = df.loc if use_loc else df.iloc
        accessor[rows_to_touch, column_key] = new_values

    # The read must already succeed before any mutation happens.
    eval_general(md_df, pd_df, read_cells)
    # Mutate in place and compare the resulting frames.
    eval_general(
        md_df,
        pd_df,
        write_cells,
        __inplace__=True,
    )
    # Both frames must still agree on a follow-up read.
    eval_general(md_df, pd_df, read_cells)


def test_iloc_set_negative_index():
    rng = np.random.default_rng(seed=0)
    n_rows = 50
    n_cols = 80
    frame_data = {f"col_{i}": rng.integers(0, 100, size=n_rows) for i in range(n_cols)}
    n_row_writes = 20
    n_col_writes = 30
    # Unsorted positional indexers; values may repeat and may be negative.
    rows_to_touch = rng.integers(-n_rows, n_rows, size=n_row_writes)
    cols_to_touch = rng.integers(-n_cols, n_cols, size=n_col_writes)
    new_values = np.reshape(
        range(100, 100 + n_row_writes * n_col_writes), (n_row_writes, n_col_writes)
    )
    md_df, pd_df = create_test_dfs(frame_data)

    def read_cells(df):
        return df.iloc[rows_to_touch, cols_to_touch]

    def write_cells(df):
        df.iloc[rows_to_touch, cols_to_touch] = new_values

    # Reading through the negative indexers must succeed before mutation.
    eval_general(md_df, pd_df, read_cells)
    # Write in place and compare the resulting frames.
    eval_general(
        md_df,
        pd_df,
        write_cells,
        __inplace__=True,
    )
    # Both frames must still agree on a follow-up read.
    eval_general(md_df, pd_df, read_cells)
Loading