diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b5e701d2d4b..b107089eda5 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -298,6 +298,7 @@ def register( cls, func: Callable[..., pandas.DataFrame], join_type: str = "outer", + sort: bool = None, labels: str = "replace", infer_dtypes: Optional[str] = None, ) -> Callable[..., PandasQueryCompiler]: @@ -310,6 +311,8 @@ def register( Binary function to execute. Have to be able to accept at least two arguments. join_type : {'left', 'right', 'outer', 'inner', None}, default: 'outer' Type of join that will be used if indices of operands are not aligned. + sort : bool, default: None + Whether to sort index and columns or not. labels : {"keep", "replace", "drop"}, default: "replace" Whether keep labels from left Modin DataFrame, replace them with labels from joined DataFrame or drop altogether to make them be computed lazily later. @@ -419,6 +422,7 @@ def caller( lambda x, y: func(x, y, *args, **kwargs), [other._modin_frame], join_type=join_type, + sort=sort, labels=labels, dtypes=dtypes, ), diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index b8467ebe084..5456f28f127 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3255,7 +3255,6 @@ def broadcast_apply( axis, other, join_type, - sort=not self.get_axis(axis).equals(other.get_axis(axis)), ) # unwrap list returned by `copartition`. right_parts = right_parts[0] @@ -3681,7 +3680,7 @@ def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> boo ) and self._get_axis_lengths(axis) == other._get_axis_lengths(axis) def _copartition( - self, axis, other, how, sort, force_repartition=False, fill_value=None + self, axis, other, how, sort=None, force_repartition=False, fill_value=None ): """ Copartition two Modin DataFrames. @@ -3696,8 +3695,9 @@ def _copartition( Other Modin DataFrame(s) to copartition against. how : str How to manage joining the index object ("left", "right", etc.). - sort : bool + sort : bool, default: None Whether sort the joined index or not. + If ``None``, sort is defined in depend on labels equality along the axis. force_repartition : bool, default: False Whether force the repartitioning or not. By default, this method will skip repartitioning if it is possible. This is because @@ -3730,6 +3730,9 @@ def _copartition( self._get_axis_lengths_cache(axis), ) + if sort is None: + sort = not all(self.get_axis(axis).equals(o.get_axis(axis)) for o in other) + self_index = self.get_axis(axis) others_index = [o.get_axis(axis) for o in other] joined_index, make_reindexer = self._join_index_objects( @@ -3823,6 +3826,7 @@ def n_ary_op( op, right_frames: list[PandasDataframe], join_type="outer", + sort=None, copartition_along_columns=True, labels="replace", dtypes: Optional[pandas.Series] = None, @@ -3838,6 +3842,8 @@ def n_ary_op( Modin DataFrames to join with. join_type : str, default: "outer" Type of join to apply. + sort : bool, default: None + Whether to sort index and columns or not. copartition_along_columns : bool, default: True Whether to perform copartitioning along columns or not. For some ops this isn't needed (e.g., `fillna`). @@ -3854,7 +3860,10 @@ def n_ary_op( New Modin DataFrame. """ left_parts, list_of_right_parts, joined_index, row_lengths = self._copartition( - 0, right_frames, join_type, sort=True + 0, + right_frames, + join_type, + sort=sort, ) if copartition_along_columns: new_left_frame = self.__constructor__( @@ -3886,7 +3895,7 @@ def n_ary_op( 1, new_right_frames, join_type, - sort=True, + sort=sort, ) else: joined_columns = self.copy_columns_cache(copy_lengths=True) @@ -3978,7 +3987,7 @@ def _compute_new_widths(): joined_index, partition_sizes_along_axis, ) = self._copartition( - axis.value ^ 1, others, how, sort, force_repartition=False + axis.value ^ 1, others, how, sort=sort, force_repartition=False ) if axis == Axis.COL_WISE: new_lengths = partition_sizes_along_axis diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7c4f7e79f55..9d4467c2085 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -460,6 +460,7 @@ def to_numpy(self, **kwargs): df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", + sort=False, ) series_update = Binary.register( copy_df_for_func( @@ -467,6 +468,7 @@ def to_numpy(self, **kwargs): display_name="update", ), join_type="left", + sort=False, ) # Needed for numpy API diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index e153f9f892f..108e2620aac 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -527,3 +527,30 @@ def test_arithmetic_with_tricky_dtypes(val1, val2, op, request): lambda dfs: getattr(dfs[0], op)(dfs[1]), expected_exception=expected_exception, ) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("match_index", [True, False]) +def test_bin_op_mismatched_columns(data, other_data, axis, match_index): + modin_df, pandas_df = create_test_dfs(data) + other_modin_df, other_pandas_df = create_test_dfs(other_data) + if axis == 0: + if not match_index: + modin_df.index = pandas_df.index = ["1", "2", "3"] + other_modin_df.index = other_pandas_df.index = ["2", "1", "3"] + eval_general( + modin_df, + pandas_df, + lambda df: ( + df.add(other_modin_df, axis=axis) + if isinstance(df, pd.DataFrame) + else df.add(other_pandas_df, axis=axis) + ), + ) diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index 4b19d5fbd9d..d6980cd6761 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1592,6 +1592,7 @@ def test_transpose(data): "data, other_data", [ ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), ( {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, {"B": ["d", "e", "f", "g", "h", "i"]},