From dd6b101d852d6cd3de6f3892609a985b4ade75c5 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Mon, 29 Apr 2024 14:18:01 -0500 Subject: [PATCH 01/19] FEAT-modin-project#4605: Add small query compiler --- .github/workflows/ci.yml | 52 + modin/config/__init__.py | 2 + modin/config/envvars.py | 5 + .../pandas/small_query_compiler.py | 1054 +++++++++++++++++ modin/pandas/base.py | 5 + modin/pandas/dataframe.py | 31 +- modin/pandas/io.py | 8 +- modin/pandas/series.py | 46 +- modin/pandas/utils.py | 6 +- modin/tests/pandas/dataframe/test_binary.py | 6 +- modin/tests/pandas/dataframe/test_default.py | 11 +- modin/tests/pandas/dataframe/test_indexing.py | 6 +- modin/utils.py | 20 +- 13 files changed, 1223 insertions(+), 29 deletions(-) create mode 100644 modin/experimental/core/storage_formats/pandas/small_query_compiler.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 68ea8eaac3e..c5e0b02b460 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -220,6 +220,10 @@ jobs: id: filter with: filters: | + test-small-query-compiler: + - 'modin/experimental/core/storage_formats/pandas/small_query_compiler.py' + - 'modin/core/storage_formats/pandas/query_compiler.py' + - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared - 'modin/core/execution/dispatching/**' ray: @@ -631,6 +635,54 @@ jobs: python-version: ${{matrix.python-version}} - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py + test-small-query-compiler: + needs: [changes, lint-flake8, lint-black, test-api, test-headers] + if: ${{ needs.changes.outputs.test-small-query-compiler == 'true' }} + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + strategy: + matrix: + python-version: ["3.9"] + env: + MODIN_SMALL_QUERY_COMPILER: "True" + name: test-small-query-compiler python ${{matrix.python-version}} + steps: + - uses: actions/checkout@v4 + - uses: ./.github/actions/mamba-env + with: + environment-file: environment-dev.yml + 
python-version: ${{matrix.python-version}} + - run: python -m pytest modin/tests/config/test_envvars.py + - run: python -m pytest modin/tests/config/test_parameter.py + - run: python -m pytest modin/tests/pandas/dataframe/test_binary.py + - run: python -m pytest modin/tests/pandas/dataframe/test_default.py + - run: python -m pytest modin/tests/pandas/dataframe/test_indexing.py + - run: python -m pytest modin/tests/pandas/dataframe/test_iter.py + - run: python -m pytest modin/tests/pandas/dataframe/test_join_sort.py + - run: python -m pytest modin/tests/pandas/dataframe/test_map_metadata.py + - run: python -m pytest modin/tests/pandas/dataframe/test_pickle.py + - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py + - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py + - run: python -m pytest modin/tests/pandas/dataframe/test_window.py + - run: python -m pytest modin/tests/pandas/extensions/test_dataframe_extensions.py + - run: python -m pytest modin/tests/pandas/extensions/test_pd_extensions.py + - run: python -m pytest modin/tests/pandas/extensions/test_series_extensions.py + - run: python -m pytest modin/tests/pandas/integrations/test_lazy_import.py + - run: python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py + - run: python -m pytest modin/tests/pandas/internals/test_repartition.py + - run: python -m pytest modin/tests/pandas/test_api.py + - run: python -m pytest modin/tests/pandas/test_concat.py + - run: python -m pytest modin/tests/pandas/test_expanding.py + - run: python -m pytest modin/tests/pandas/test_general.py + - run: python -m pytest modin/tests/pandas/test_groupby.py + - run: python -m pytest modin/tests/pandas/test_io.py + - run: python -m pytest modin/tests/pandas/test_reshape.py + - run: python -m pytest modin/tests/pandas/test_rolling.py + - run: python -m pytest modin/tests/pandas/test_series.py + - uses: codecov/codecov-action@v2 + merge-coverage-artifacts: needs: [test-internals, 
test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity] if: always() # we need to run it regardless of some job being skipped, like in PR diff --git a/modin/config/__init__.py b/modin/config/__init__.py index cf5f7895c5d..4e6cf88ddd3 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -53,6 +53,7 @@ TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, + InitializeWithSmallQueryCompilers ) from modin.config.pubsub import Parameter, ValueSource, context @@ -68,6 +69,7 @@ "CpuCount", "GpuCount", "Memory", + "PersistentPickle", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 8654ebe30c1..9b045992417 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -911,6 +911,11 @@ def _check_vars() -> None: deprecated[depr_var].deprecation_message(use_envvar_names=True), FutureWarning, ) +class InitializeWithSmallQueryCompilers(EnvironmentVariable, type=bool): + """Set to true to use implementation of SmallQueryCompiler.""" + + varname = "MODIN_SMALL_QUERY_COMPILER" + default = False _check_vars() diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py new file mode 100644 index 00000000000..f92e9b4f9b9 --- /dev/null +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -0,0 +1,1054 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +""" +Module contains ``SmallQueryCompiler`` class. + +``SmallQueryCompiler`` is responsible for compiling efficient DataFrame algebra +queries for small data and empty ``PandasDataFrame``. +""" + +from modin.config.envvars import InitializeWithSmallQueryCompilers +import numpy as np +import pandas +from pandas.core.dtypes.common import ( + is_list_like, + is_scalar, +) + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.utils import MODIN_UNNAMED_SERIES_LABEL +from modin.utils import ( + _inherit_docstrings, + try_cast_to_pandas, +) + + +def _get_axis(axis): + """ + Build index labels getter of the specified axis. + + Parameters + ---------- + axis : {0, 1} + Axis to get labels from. 0 is for index and 1 is for column. + + Returns + ------- + callable(PandasQueryCompiler) -> pandas.Index + """ + if axis == 0: + return lambda self: self._pandas_frame.index + else: + return lambda self: self._pandas_frame.columns + + +def _set_axis(axis): + """ + Build index labels setter of the specified axis. + + Parameters + ---------- + axis : {0, 1} + Axis to set labels on. 0 is for index and 1 is for column. + + Returns + ------- + callable(PandasQueryCompiler) + """ + if axis == 0: + + def set_axis(self, idx): + self._pandas_frame.index = idx + + else: + + def set_axis(self, cols): + self._pandas_frame.columns = cols + + return set_axis + + +def _str_map(func_name): + """ + Build function that calls specified string function on frames ``str`` accessor. 
+ + Parameters + ---------- + func_name : str + String function name to execute on ``str`` accessor. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + """ + + def str_op_builder(df, *args, **kwargs): + """Apply specified function against `str` accessor of the passed frame.""" + str_s = df.squeeze(axis=1).str + return getattr(pandas.Series.str, func_name)(str_s, *args, **kwargs).to_frame() + + return str_op_builder + + +def _dt_prop_map(property_name): + """ + Build function that access specified property of the ``dt`` property of the passed frame. + + Parameters + ---------- + property_name : str + Date-time property name to access. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied in the frame. + + Notes + ----- + This applies non-callable properties of ``Series.dt``. + """ + + def dt_op_builder(df, *args, **kwargs): + """Access specified date-time property of the passed frame.""" + prop_val = getattr(df.squeeze(axis=1).dt, property_name) + if isinstance(prop_val, pandas.Series): + return prop_val.to_frame() + elif isinstance(prop_val, pandas.DataFrame): + return prop_val + else: + return pandas.DataFrame([prop_val]) + + return dt_op_builder + + +def _dt_func_map(func_name): + """ + Build function that apply specified method against ``dt`` property of the passed frame. + + Parameters + ---------- + func_name : str + Date-time function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied in the frame. + + Notes + ----- + This applies callable methods of ``Series.dt``. 
+ """ + + def dt_op_builder(df, *args, **kwargs): + """Apply specified function against ``dt`` accessor of the passed frame.""" + dt_s = df.squeeze(axis=1).dt + dt_func_result = getattr(pandas.Series.dt, func_name)(dt_s, *args, **kwargs) + # If we don't specify the dtype for the frame, the frame might get the + # wrong dtype, e.g. for to_pydatetime in https://github.com/modin-project/modin/issues/4436 + return pandas.DataFrame(dt_func_result, dtype=dt_func_result.dtype) + + return dt_op_builder + + +def _rolling_func(func): + """ + Build function that apply specified rolling method of the passed frame. + + Parameters + ---------- + func : str + Rolling function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + + def rolling_builder(df, fold_axis, rolling_args, *args, **kwargs): + rolling_result = df.rolling(*rolling_args) + rolling_op = getattr(rolling_result, func) + return rolling_op(*args, **kwargs) + + return rolling_builder + + +def _reindex(df, axis, labels, **kwargs): # noqa: GL08 + return df.reindex(labels=labels, axis=axis, **kwargs) + + +def _concat(df, axis, other, join_axes=None, **kwargs): # noqa: GL08 + if not isinstance(other, list): + other = [other] + if ( + isinstance(df, pandas.DataFrame) + and len(df.columns) == 1 + and df.columns[0] == MODIN_UNNAMED_SERIES_LABEL + ): + df = df[df.columns[0]] + + ignore_index = kwargs.get("ignore_index", False) + concat_join = ["outer", "inner"] + if kwargs.get("join", "outer") in concat_join: + if not isinstance(other, list): + other = [other] + other = [df] + other + result = pandas.concat(other, axis=axis, **kwargs) + else: + if isinstance(other, (list, np.ndarray)) and len(other) == 1: + other = other[0] + ignore_index = kwargs.pop("ignore_index", None) + kwargs["how"] = kwargs.pop("join", None) + if isinstance(other, (pandas.DataFrame, pandas.Series)): + result = df.join(other, rsuffix="r_", **kwargs) + else: 
+ result = df.join(other, **kwargs) + if ignore_index: + if axis == 0: + result = result.reset_index(drop=True) + else: + result.columns = pandas.RangeIndex(len(result.columns)) + return result + + +def _to_datetime(df, *args, **kwargs): # noqa: GL08 + return pandas.to_datetime(df.squeeze(axis=1), *args, **kwargs) + + +def _to_numeric(df, *args, **kwargs): # noqa: GL08 + return pandas.to_numeric(df.squeeze(axis=1), *args, **kwargs) + + +def _groupby(agg_name): + """ + Build function that apply specified groupby method of the passed frame. + + Parameters + ---------- + agg_name : str + GroupBy aggregate function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + __aggregation_methods_dict = { + "axis_wise": pandas.core.groupby.DataFrameGroupBy.aggregate, + "group_wise": pandas.core.groupby.DataFrameGroupBy.apply, + "transform": pandas.core.groupby.DataFrameGroupBy.transform, + } + + def groupby_callable( + df, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + agg_func=None, + how="axis_wise", + drop=False, + **kwargs + ): + by_names = [] + if isinstance(by, pandas.DataFrame): + by = by.squeeze(axis=1) + if isinstance(by, list): + for i in range(len(by)): + if isinstance(by[i], pandas.DataFrame): + by[i] = by[i].squeeze(axis=1) + if isinstance(by[i], pandas.Series): + if isinstance(df.index, pandas.MultiIndex): + by[i].name = pandas.MultiIndex.from_tuples(by[i].name) + by_names.append(by[i].name) + elif isinstance(by[i], str): + by_names.append(by[i]) + if isinstance(by, pandas.DataFrame): + by_names = list(by.columns) + to_append = by.columns[[name not in df.columns for name in by_names]] + if len(to_append) > 0: + df = pandas.concat([df, by[to_append]], axis=1) + by = by_names + if isinstance(by, pandas.Series) and drop: + by_names = [by.name] + if ( + is_list_like(by) + and drop + and not any([is_list_like(curr_by) for curr_by in by]) + ): + by = by_names + 
+ groupby_obj = df.groupby(by=by, axis=axis, **groupby_kwargs) + if agg_name == "agg": + if isinstance(agg_func, dict): + # Related to pandas issue when dict with list of funcs as value is passed in agg_func + # https://github.com/pandas-dev/pandas/issues/39103 + agg_func = { + k: v[0] if isinstance(v, list) and len(v) == 1 else v + for k, v in agg_func.items() + } + groupby_agg = __aggregation_methods_dict[how] + result = groupby_agg(groupby_obj, agg_func, *agg_args, **agg_kwargs) + else: + groupby_agg = getattr(groupby_obj, agg_name) + if callable(groupby_agg): + result = groupby_agg(*agg_args, **agg_kwargs) + else: + result = groupby_agg + + return result + + return groupby_callable + + +def _take_2d(df, index=None, columns=None): # noqa: GL08 + columns = columns if columns is not None else slice(None) + index = index if index is not None else slice(None) + return df.iloc[index, columns] + + +def _register_binary(op): + """ + Build function that apply specified binary method of the passed frame. + + Parameters + ---------- + op : str + Binary function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + + def binary_operator(df, other, **kwargs): + if isinstance(other, pandas.DataFrame) and ( + not df.empty + or ( + len(other.columns) == 1 + and other.columns[0] == MODIN_UNNAMED_SERIES_LABEL + ) + ): + other = other.squeeze() + return getattr(df, op)(other, **kwargs) + + return binary_operator + + +def _register_resample(op): + """ + Build function that apply specified resample method of the passed frame. + + Parameters + ---------- + op : str + Resample function name to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. 
+ """ + + def resample_operator(df, resample_kwargs, *args, **kwargs): + resampler = df.resample(**resample_kwargs) + result = getattr(resampler, op)(*args, **kwargs) + return result + + return resample_operator + + +def _drop(df, **kwargs): # noqa: GL08 + if ( + kwargs.get("labels", None) is not None + or kwargs.get("index", None) is not None + or kwargs.get("columns", None) is not None + ): + return df.drop(**kwargs) + return df + + +def _fillna(df, squeeze_self=True, squeeze_value=False, **kwargs): # noqa: GL08 + if len(df.columns) == 1 and df.columns[0] == "__reduced__": + df = df["__reduced__"] + return df.fillna(**kwargs) + + +def _is_monotonic(monotonic_type): # noqa: GL08 + def is_monotonic_caller(ser): + return pandas.DataFrame([getattr(ser, monotonic_type)]) + + return is_monotonic_caller + + +def _sort_index(df, inplace=False, **kwargs): # noqa: GL08 + if inplace: + df.sort_index(inplace=inplace, **kwargs) + else: + df = df.sort_index(inplace=inplace, **kwargs) + return df + + +def _combine(df, other, func, **kwargs): # noqa: GL08 + if isinstance(df, pandas.Series): + return func(df, other) + return df.combine(other, func) + + +def _getitem_array(df, key): # noqa: GL08 + if isinstance(key, pandas.DataFrame): + key = key.squeeze(axis=1) + return df[key] + + +def _getitem_row_array(df, key): # noqa: GL08 + if isinstance(key, pandas.DataFrame): + key = key.squeeze(axis=1) + return df.iloc[key] + + +def _write_items( + df, row_numeric_index, col_numeric_index, broadcasted_items +): # noqa: GL08 + if not isinstance(row_numeric_index, slice): + row_numeric_index = list(row_numeric_index) + if not isinstance(col_numeric_index, slice): + col_numeric_index = list(col_numeric_index) + + if isinstance(df.iloc[row_numeric_index, col_numeric_index], pandas.Series): + broadcasted_items = broadcasted_items.squeeze() + df.iloc[row_numeric_index, col_numeric_index] = broadcasted_items + return df + + +def _setitem(df, axis, key, value): # noqa: GL08 + if is_scalar(key) 
and isinstance(value, pandas.DataFrame): + value = value.squeeze() + if not axis: + df[key] = value + else: + df.loc[key] = value + return df + + +def _delitem(df, key): # noqa: GL08 + return df.drop(columns=[key]) + + +def _get_dummies(df, columns, **kwargs): # noqa: GL08 + return pandas.get_dummies(df, columns=columns, **kwargs) + + +@_inherit_docstrings(BaseQueryCompiler) +class SmallQueryCompiler(BaseQueryCompiler): + """ + Query compiler for the pandas storage format. + + This class translates common query compiler API to default all methods + to pandas. + + Parameters + ---------- + pandas_frame : pandas.DataFrame + Modin Frame to query with the compiled queries. + """ + + def __init__(self, pandas_frame): + assert InitializeWithSmallQueryCompilers.get() + if hasattr(pandas_frame, "_to_pandas"): + pandas_frame = pandas_frame._to_pandas() + if is_scalar(pandas_frame): + pandas_frame = pandas.DataFrame([pandas_frame]) + elif not isinstance(pandas_frame, pandas.DataFrame): + pandas_frame = pandas.DataFrame(pandas_frame) + + self._pandas_frame = pandas_frame + + def default_to_pandas(self, pandas_op, *args, **kwargs): + args = (a.to_pandas() if isinstance(a, type(self)) else a for a in args) + kwargs = { + k: v.to_pandas() if isinstance(v, type(self)) else v + for k, v in kwargs.items() + } + + result = pandas_op(self._pandas_frame, *args, **kwargs) + if isinstance(result, pandas.Series): + if result.name is None: + result.name = MODIN_UNNAMED_SERIES_LABEL + result = result.to_frame() + + return result + + def execute(self): + """Wait for all computations to complete without materializing data.""" + pass + + + + def _register_default_pandas( + func, + is_series=False, + squeeze_series=False, + squeeze_args=False, + squeeze_kwargs=False, + return_modin=True, + in_place=False, + df_copy=False, + filter_kwargs=[], + ): + """ + Build function that apply specified method of the passed frame. + + Parameters + ---------- + func : callable + Function to apply. 
+ is_series : bool, default: False + If True, the passed frame will always be squeezed to a series. + squeeze_series : bool, default: False + If True, the passed frame will always be squeezed to a series if there is a single column named "__reduced__". + squeeze_args : bool, default: False + If True, all passed arguments will be squeezed. + squeeze_kwargs : bool, default: False + If True, all passed key word arguments will be squeezed. + return_modin : bool, default: True + If True, the result will always try to convert to DataFrame or Series. + in_place : bool, default: False + If True, the specified function will be applied on the passed frame in place. + df_copy : bool, default: False + If True, the specified function will be applied to a copy of the passed frame. + filter_kwargs : list, default: [] + List of key word argument names to remove. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + + def caller(query_compiler, *args, **kwargs): + df = query_compiler._pandas_frame + if df_copy: + df = df.copy() + if is_series: + df = df.squeeze(axis=1) + elif ( + squeeze_series + and len(df.columns) == 1 + and df.columns[0] == MODIN_UNNAMED_SERIES_LABEL + ): + df = df.squeeze(axis=1) + exclude_names = [ + "broadcast", + "fold_axis", + "squeeze_self", + "squeeze_value", + "ignore_indices" + ] + filter_kwargs + kwargs = kwargs.copy() + for name in exclude_names: + kwargs.pop(name, None) + args = try_cast_to_pandas(args, squeeze=squeeze_args, squeeze_df=True) + kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs, squeeze_df=True) + result = func(df, *args, **kwargs) + if in_place: + result = df + if not ( + return_modin or isinstance(result, (pandas.Series, pandas.DataFrame)) + ): + return result + if isinstance(result, pandas.Series): + if result.name is None: + result.name = MODIN_UNNAMED_SERIES_LABEL + result = result.to_frame() + return query_compiler.__constructor__(result) + 
+ return caller + + __and__ = _register_default_pandas(pandas.DataFrame.__and__, squeeze_series=True) + __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) + __eq__ = _register_default_pandas(pandas.DataFrame.__eq__, squeeze_series=True) + __format__ = _register_default_pandas(pandas.DataFrame.__format__) + __ge__ = _register_default_pandas(pandas.DataFrame.__ge__, squeeze_series=True) + __gt__ = _register_default_pandas(pandas.DataFrame.__gt__, squeeze_series=True) + __le__ = _register_default_pandas(pandas.DataFrame.__le__, squeeze_series=True) + __lt__ = _register_default_pandas(pandas.DataFrame.__lt__, squeeze_series=True) + __ne__ = _register_default_pandas(pandas.DataFrame.__ne__, squeeze_series=True) + __or__ = _register_default_pandas(pandas.DataFrame.__or__, squeeze_series=True) + __rand__ = _register_default_pandas(pandas.DataFrame.__rand__, squeeze_series=True) + __reduce__ = _register_default_pandas( + pandas.DataFrame.__reduce__, return_modin=False + ) + __reduce_ex__ = _register_default_pandas( + pandas.DataFrame.__reduce_ex__, return_modin=False + ) + __ror__ = _register_default_pandas(pandas.DataFrame.__ror__, squeeze_series=True) + __rxor__ = _register_default_pandas(pandas.DataFrame.__rxor__, squeeze_series=True) + __sizeof__ = _register_default_pandas(pandas.DataFrame.__sizeof__) + __xor__ = _register_default_pandas(pandas.DataFrame.__xor__, squeeze_series=True) + abs = _register_default_pandas(pandas.DataFrame.abs) + add = _register_default_pandas(_register_binary("add")) + all = _register_default_pandas(pandas.DataFrame.all) + any = _register_default_pandas(pandas.DataFrame.any) + apply = _register_default_pandas(pandas.DataFrame.apply) + apply_on_series = _register_default_pandas(pandas.Series.apply, is_series=True) + applymap = _register_default_pandas(pandas.DataFrame.applymap) + astype = _register_default_pandas(pandas.DataFrame.astype) + cat_codes = _register_default_pandas(lambda ser: ser.cat.codes, is_series=True) + clip = 
_register_default_pandas(pandas.DataFrame.clip) + combine = _register_default_pandas(_combine, squeeze_series=True) + combine_first = _register_default_pandas( + lambda df, other: df.combine_first(other), squeeze_series=True + ) + compare = _register_default_pandas(pandas.DataFrame.compare) + concat = _register_default_pandas(_concat) + conj = _register_default_pandas( + lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df)) + ) + convert_dtypes = _register_default_pandas(pandas.DataFrame.convert_dtypes) + copy = _register_default_pandas(pandas.DataFrame.copy) + count = _register_default_pandas(pandas.DataFrame.count) + corr = _register_default_pandas(pandas.DataFrame.corr) + cov = _register_default_pandas(pandas.DataFrame.cov) + cummax = _register_default_pandas(pandas.DataFrame.cummax) + cummin = _register_default_pandas(pandas.DataFrame.cummin) + cumprod = _register_default_pandas(pandas.DataFrame.cumprod) + cumsum = _register_default_pandas(pandas.DataFrame.cumsum) + delitem = _register_default_pandas(_delitem) + describe = _register_default_pandas(pandas.DataFrame.describe) + df_update = _register_default_pandas( + pandas.DataFrame.update, in_place=True, df_copy=True + ) + diff = _register_default_pandas(pandas.DataFrame.diff) + drop = _register_default_pandas(_drop) + dropna = _register_default_pandas(pandas.DataFrame.dropna) # axis values switched? 
+ dt_ceil = _register_default_pandas(_dt_func_map("ceil")) + dt_components = _register_default_pandas(_dt_prop_map("components")) + dt_date = _register_default_pandas(_dt_prop_map("date")) + dt_day = _register_default_pandas(_dt_prop_map("day")) + dt_day_name = _register_default_pandas(_dt_func_map("day_name")) + dt_dayofweek = _register_default_pandas(_dt_prop_map("dayofweek")) + dt_dayofyear = _register_default_pandas(_dt_prop_map("dayofyear")) + dt_days = _register_default_pandas(_dt_prop_map("days")) + dt_days_in_month = _register_default_pandas(_dt_prop_map("days_in_month")) + dt_daysinmonth = _register_default_pandas(_dt_prop_map("daysinmonth")) + dt_end_time = _register_default_pandas(_dt_prop_map("end_time")) + dt_floor = _register_default_pandas(_dt_func_map("floor")) + dt_freq = _register_default_pandas( + lambda df: pandas.DataFrame([df.squeeze(axis=1).dt.freq]) + ) + dt_hour = _register_default_pandas(_dt_prop_map("hour")) + dt_is_leap_year = _register_default_pandas(_dt_prop_map("is_leap_year")) + dt_is_month_end = _register_default_pandas(_dt_prop_map("is_month_end")) + dt_is_month_start = _register_default_pandas(_dt_prop_map("is_month_start")) + dt_is_quarter_end = _register_default_pandas(_dt_prop_map("is_quarter_end")) + dt_is_quarter_start = _register_default_pandas(_dt_prop_map("is_quarter_start")) + dt_is_year_end = _register_default_pandas(_dt_prop_map("is_year_end")) + dt_is_year_start = _register_default_pandas(_dt_prop_map("is_year_start")) + dt_microsecond = _register_default_pandas(_dt_prop_map("microsecond")) + dt_microseconds = _register_default_pandas(_dt_prop_map("microseconds")) + dt_minute = _register_default_pandas(_dt_prop_map("minute")) + dt_month = _register_default_pandas(_dt_prop_map("month")) + dt_month_name = _register_default_pandas(_dt_func_map("month_name")) + dt_nanosecond = _register_default_pandas(_dt_prop_map("nanosecond")) + dt_nanoseconds = _register_default_pandas(_dt_prop_map("nanoseconds")) + dt_normalize = 
_register_default_pandas(_dt_func_map("normalize")) + dt_quarter = _register_default_pandas(_dt_prop_map("quarter")) + dt_qyear = _register_default_pandas(_dt_prop_map("qyear")) + dt_round = _register_default_pandas(_dt_func_map("round")) + dt_second = _register_default_pandas(_dt_prop_map("second")) + dt_seconds = _register_default_pandas(_dt_prop_map("seconds")) + dt_start_time = _register_default_pandas(_dt_prop_map("start_time")) + dt_strftime = _register_default_pandas(_dt_func_map("strftime")) + dt_time = _register_default_pandas(_dt_prop_map("time")) + dt_timetz = _register_default_pandas(_dt_prop_map("timetz")) + dt_to_period = _register_default_pandas(_dt_func_map("to_period")) + dt_to_pydatetime = _register_default_pandas(_dt_func_map("to_pydatetime")) + dt_to_pytimedelta = _register_default_pandas(_dt_func_map("to_pytimedelta")) + dt_to_timestamp = _register_default_pandas(_dt_func_map("to_timestamp")) + dt_total_seconds = _register_default_pandas(_dt_func_map("total_seconds")) + dt_tz = _register_default_pandas( + lambda df: pandas.DataFrame([df.squeeze(axis=1).dt.tz]) + ) + dt_tz_convert = _register_default_pandas(_dt_func_map("tz_convert")) + dt_tz_localize = _register_default_pandas(_dt_func_map("tz_localize")) + dt_week = _register_default_pandas(_dt_prop_map("week")) + dt_weekday = _register_default_pandas(_dt_prop_map("weekday")) + dt_weekofyear = _register_default_pandas(_dt_prop_map("weekofyear")) + dt_year = _register_default_pandas(_dt_prop_map("year")) + eq = _register_default_pandas(_register_binary("eq"), filter_kwargs=["dtypes"]) + eval = _register_default_pandas(pandas.DataFrame.eval) + explode = _register_default_pandas(pandas.DataFrame.explode) + fillna = _register_default_pandas(_fillna) + first_valid_index = _register_default_pandas( + pandas.DataFrame.first_valid_index, return_modin=False + ) + floordiv = _register_default_pandas(_register_binary("floordiv")) + ge = _register_default_pandas(pandas.DataFrame.ge, 
filter_kwargs=["dtypes"]) + get_dummies = _register_default_pandas(_get_dummies) + getitem_array = _register_default_pandas(_getitem_array) + getitem_row_array = _register_default_pandas(_getitem_row_array) + groupby_agg = _register_default_pandas(_groupby("agg")) + groupby_all = _register_default_pandas(_groupby("all")) + groupby_any = _register_default_pandas(_groupby("any")) + groupby_count = _register_default_pandas(_groupby("count")) + groupby_cummax = _register_default_pandas(_groupby("cummax")) + groupby_cummin = _register_default_pandas(_groupby("cummin")) + groupby_cumprod = _register_default_pandas(_groupby("cumprod")) + groupby_cumsum = _register_default_pandas(_groupby("cumsum")) + groupby_dtypes = _register_default_pandas(_groupby("dtypes")) + groupby_fillna = _register_default_pandas(_groupby("fillna")) + groupby_max = _register_default_pandas(_groupby("max")) + groupby_mean = _register_default_pandas(_groupby("mean")) + groupby_median = _register_default_pandas(_groupby("median")) + groupby_min = _register_default_pandas(_groupby("min")) + groupby_nunique = _register_default_pandas(_groupby("nunique")) + groupby_prod = _register_default_pandas(_groupby("prod")) + groupby_quantile = _register_default_pandas(_groupby("quantile")) + groupby_rank = _register_default_pandas(_groupby("rank")) + groupby_shift = _register_default_pandas(_groupby("shift")) + groupby_size = _register_default_pandas(_groupby("size")) + groupby_skew = _register_default_pandas(_groupby("skew")) + groupby_std = _register_default_pandas(_groupby("std")) + groupby_sum = _register_default_pandas(_groupby("sum")) + groupby_var = _register_default_pandas(_groupby("var")) + gt = _register_default_pandas(pandas.DataFrame.gt, filter_kwargs=["dtypes"]) + idxmax = _register_default_pandas(pandas.DataFrame.idxmax) + idxmin = _register_default_pandas(pandas.DataFrame.idxmin) + infer_objects = _register_default_pandas( + pandas.DataFrame.infer_objects, return_modin=False + ) + insert = 
_register_default_pandas( + pandas.DataFrame.insert, in_place=True, squeeze_args=True + ) + invert = _register_default_pandas(pandas.DataFrame.__invert__) + is_monotonic = _register_default_pandas( + _is_monotonic("is_monotonic"), is_series=True + ) + is_monotonic_decreasing = _register_default_pandas( + _is_monotonic("is_monotonic_decreasing"), is_series=True + ) + is_monotonic_increasing = _register_default_pandas( + _is_monotonic("is_monotonic_increasing"), is_series=True + ) + isin = _register_default_pandas(pandas.DataFrame.isin) + isna = _register_default_pandas(pandas.DataFrame.isna) + join = _register_default_pandas(pandas.DataFrame.join) + kurt = _register_default_pandas(pandas.DataFrame.kurt) + last_valid_index = _register_default_pandas( + pandas.DataFrame.last_valid_index, return_modin=False + ) + le = _register_default_pandas(pandas.DataFrame.le, filter_kwargs=["dtypes"]) + lt = _register_default_pandas(pandas.DataFrame.lt, filter_kwargs=["dtypes"]) + #mad = _register_default_pandas(pandas.DataFrame.mad) + max = _register_default_pandas(pandas.DataFrame.max) + mean = _register_default_pandas(pandas.DataFrame.mean) + median = _register_default_pandas(pandas.DataFrame.median) + melt = _register_default_pandas(pandas.DataFrame.melt) + memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) + merge = _register_default_pandas(pandas.DataFrame.merge) + min = _register_default_pandas(pandas.DataFrame.min) + mod = _register_default_pandas(_register_binary("mod")) + mode = _register_default_pandas(pandas.DataFrame.mode) + mul = _register_default_pandas(_register_binary("mul")) + ne = _register_default_pandas(pandas.DataFrame.ne, filter_kwargs=["dtypes"]) + negative = _register_default_pandas(pandas.DataFrame.__neg__) + nlargest = _register_default_pandas(pandas.DataFrame.nlargest) + notna = _register_default_pandas(pandas.DataFrame.notna) + nsmallest = _register_default_pandas( + lambda df, **kwargs: df.nsmallest(**kwargs), squeeze_series=True + 
) + nunique = _register_default_pandas(pandas.DataFrame.nunique) + pivot = _register_default_pandas(pandas.DataFrame.pivot) + pivot_table = _register_default_pandas(pandas.DataFrame.pivot_table) + pow = _register_default_pandas(_register_binary("pow")) + prod = _register_default_pandas(pandas.DataFrame.prod) + prod_min_count = _register_default_pandas(pandas.DataFrame.prod) + quantile_for_list_of_values = _register_default_pandas(pandas.DataFrame.quantile) + quantile_for_single_value = _register_default_pandas(pandas.DataFrame.quantile) + query = _register_default_pandas(pandas.DataFrame.query) + radd = _register_default_pandas(_register_binary("radd")) + rank = _register_default_pandas(pandas.DataFrame.rank) + reindex = _register_default_pandas(_reindex) + repeat = _register_default_pandas(pandas.Series.repeat, is_series=True) + replace = _register_default_pandas(pandas.DataFrame.replace) + resample_agg_df = _register_default_pandas(_register_resample("agg")) + resample_agg_ser = _register_default_pandas( + _register_resample("agg"), is_series=True + ) + resample_app_df = _register_default_pandas(_register_resample("apply")) + resample_app_ser = _register_default_pandas( + _register_resample("apply"), is_series=True + ) + resample_asfreq = _register_default_pandas(_register_resample("asfreq")) + resample_backfill = _register_default_pandas(_register_resample("backfill")) + resample_bfill = _register_default_pandas(_register_resample("bfill")) + resample_count = _register_default_pandas(_register_resample("count")) + resample_ffill = _register_default_pandas(_register_resample("ffill")) + resample_fillna = _register_default_pandas(_register_resample("fillna")) + resample_first = _register_default_pandas(_register_resample("first")) + resample_get_group = _register_default_pandas(_register_resample("get_group")) + resample_interpolate = _register_default_pandas(_register_resample("interpolate")) + resample_last = _register_default_pandas(_register_resample("last")) 
+ resample_max = _register_default_pandas(_register_resample("max")) + resample_mean = _register_default_pandas(_register_resample("mean")) + resample_median = _register_default_pandas(_register_resample("median")) + resample_min = _register_default_pandas(_register_resample("min")) + resample_nearest = _register_default_pandas(_register_resample("nearest")) + resample_nunique = _register_default_pandas(_register_resample("nunique")) + resample_ohlc_df = _register_default_pandas(_register_resample("ohlc")) + resample_ohlc_ser = _register_default_pandas( + _register_resample("ohlc"), is_series=True + ) + resample_pad = _register_default_pandas(_register_resample("pad")) + resample_pipe = _register_default_pandas(_register_resample("pipe")) + resample_prod = _register_default_pandas(_register_resample("prod")) + resample_quantile = _register_default_pandas(_register_resample("quantile")) + resample_sem = _register_default_pandas(_register_resample("sem")) + resample_size = _register_default_pandas(_register_resample("size")) + resample_std = _register_default_pandas(_register_resample("std")) + resample_sum = _register_default_pandas(_register_resample("sum")) + resample_transform = _register_default_pandas(_register_resample("transform")) + resample_var = _register_default_pandas(_register_resample("var")) + reset_index = _register_default_pandas(pandas.DataFrame.reset_index) + rfloordiv = _register_default_pandas(_register_binary("rfloordiv")) + rmod = _register_default_pandas(_register_binary("rmod")) + rolling_aggregate = _register_default_pandas(_rolling_func("aggregate")) + rolling_apply = _register_default_pandas(_rolling_func("apply")) + rolling_corr = _register_default_pandas(_rolling_func("corr")) + rolling_count = _register_default_pandas(_rolling_func("count")) + rolling_cov = _register_default_pandas(_rolling_func("cov")) + rolling_kurt = _register_default_pandas(_rolling_func("kurt")) + rolling_max = _register_default_pandas(_rolling_func("max")) + 
rolling_mean = _register_default_pandas(_rolling_func("mean")) + rolling_median = _register_default_pandas(_rolling_func("median")) + rolling_min = _register_default_pandas(_rolling_func("min")) + rolling_quantile = _register_default_pandas(_rolling_func("quantile")) + rolling_skew = _register_default_pandas(_rolling_func("skew")) + rolling_std = _register_default_pandas(_rolling_func("std")) + rolling_sum = _register_default_pandas(_rolling_func("sum")) + rolling_var = _register_default_pandas(_rolling_func("var")) + round = _register_default_pandas(pandas.DataFrame.round) + rmul = _register_default_pandas(_register_binary("rmul")) + rpow = _register_default_pandas(_register_binary("rpow")) + rsub = _register_default_pandas(_register_binary("rsub")) + rtruediv = _register_default_pandas(_register_binary("rtruediv")) + searchsorted = _register_default_pandas(pandas.Series.searchsorted, is_series=True) + sem = _register_default_pandas(pandas.DataFrame.sem) + series_update = _register_default_pandas( + pandas.Series.update, is_series=True, in_place=True, df_copy=True + ) + series_view = _register_default_pandas(pandas.Series.view, is_series=True) + set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) + setitem = _register_default_pandas(_setitem) + skew = _register_default_pandas(pandas.DataFrame.skew) + sort_index = _register_default_pandas(_sort_index) + sort_columns_by_row_values = _register_default_pandas( + lambda df, columns, **kwargs: df.sort_values(by=columns, axis=1, **kwargs) + ) + sort_rows_by_column_values = _register_default_pandas( + lambda df, columns, **kwargs: df.sort_values(by=columns, axis=0, **kwargs) + ) + stack = _register_default_pandas(pandas.DataFrame.stack) + std = _register_default_pandas(pandas.DataFrame.std) + str___getitem__ = _register_default_pandas(_str_map("__getitem__")) + str_capitalize = _register_default_pandas(_str_map("capitalize")) + str_center = _register_default_pandas(_str_map("center")) + 
str_contains = _register_default_pandas(_str_map("contains")) + str_count = _register_default_pandas(_str_map("count")) + str_endswith = _register_default_pandas(_str_map("endswith")) + str_find = _register_default_pandas(_str_map("find")) + str_findall = _register_default_pandas(_str_map("findall")) + str_get = _register_default_pandas(_str_map("get")) + str_index = _register_default_pandas(_str_map("index")) + str_isalnum = _register_default_pandas(_str_map("isalnum")) + str_isalpha = _register_default_pandas(_str_map("isalpha")) + str_isdecimal = _register_default_pandas(_str_map("isdecimal")) + str_isdigit = _register_default_pandas(_str_map("isdigit")) + str_islower = _register_default_pandas(_str_map("islower")) + str_isnumeric = _register_default_pandas(_str_map("isnumeric")) + str_isspace = _register_default_pandas(_str_map("isspace")) + str_istitle = _register_default_pandas(_str_map("istitle")) + str_isupper = _register_default_pandas(_str_map("isupper")) + str_join = _register_default_pandas(_str_map("join")) + str_len = _register_default_pandas(_str_map("len")) + str_ljust = _register_default_pandas(_str_map("ljust")) + str_lower = _register_default_pandas(_str_map("lower")) + str_lstrip = _register_default_pandas(_str_map("lstrip")) + str_match = _register_default_pandas(_str_map("match")) + str_normalize = _register_default_pandas(_str_map("normalize")) + str_pad = _register_default_pandas(_str_map("pad")) + str_partition = _register_default_pandas(_str_map("partition")) + str_repeat = _register_default_pandas(_str_map("repeat")) + str_replace = _register_default_pandas(_str_map("replace")) + str_rfind = _register_default_pandas(_str_map("rfind")) + str_rindex = _register_default_pandas(_str_map("rindex")) + str_rjust = _register_default_pandas(_str_map("rjust")) + str_rpartition = _register_default_pandas(_str_map("rpartition")) + str_rsplit = _register_default_pandas(_str_map("rsplit")) + str_rstrip = _register_default_pandas(_str_map("rstrip")) + 
str_slice = _register_default_pandas(_str_map("slice")) + str_slice_replace = _register_default_pandas(_str_map("slice_replace")) + str_split = _register_default_pandas(_str_map("split")) + str_startswith = _register_default_pandas(_str_map("startswith")) + str_strip = _register_default_pandas(_str_map("strip")) + str_swapcase = _register_default_pandas(_str_map("swapcase")) + str_title = _register_default_pandas(_str_map("title")) + str_translate = _register_default_pandas(_str_map("translate")) + str_upper = _register_default_pandas(_str_map("upper")) + str_wrap = _register_default_pandas(_str_map("wrap")) + str_zfill = _register_default_pandas(_str_map("zfill")) + sub = _register_default_pandas(_register_binary("sub")) + sum = _register_default_pandas(pandas.DataFrame.sum) + sum_min_count = _register_default_pandas(pandas.DataFrame.sum) + take_2d = _register_default_pandas(_take_2d) + to_datetime = _register_default_pandas(_to_datetime) + to_numeric = _register_default_pandas(_to_numeric) + to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_modin=False) + to_timedelta = _register_default_pandas( + lambda ser, *args, **kwargs: pandas.to_timedelta(ser, *args, **kwargs), + is_series=True, + ) + transpose = _register_default_pandas(pandas.DataFrame.transpose) + truediv = _register_default_pandas(_register_binary("truediv")) + unique = _register_default_pandas(pandas.Series.unique, is_series=True) + unstack = _register_default_pandas(pandas.DataFrame.unstack) + var = _register_default_pandas(pandas.DataFrame.var) + where = _register_default_pandas(pandas.DataFrame.where) + window_mean = _register_default_pandas(_rolling_func("mean")) + window_std = _register_default_pandas(_rolling_func("std")) + window_sum = _register_default_pandas(_rolling_func("sum")) + window_var = _register_default_pandas(_rolling_func("var")) + write_items = _register_default_pandas(_write_items) + + T = property(transpose) + + _add_prefix_df = 
_register_default_pandas(pandas.DataFrame.add_prefix) + _add_prefix_series = _register_default_pandas( + pandas.Series.add_prefix, is_series=True + ) + + def add_prefix(self, prefix, axis=1): + if axis: + return self._add_prefix_df(prefix=prefix) + return self._add_prefix_series(prefix=prefix) + + _add_suffix_df = _register_default_pandas(pandas.DataFrame.add_suffix) + _add_suffix_series = _register_default_pandas( + pandas.Series.add_suffix, is_series=True + ) + + def add_suffix(self, suffix, axis=1): + if axis: + return self._add_suffix_df(suffix=suffix) + return self._add_suffix_series(suffix=suffix) + + def dot(self, other, squeeze_self=None, squeeze_other=None): + other = try_cast_to_pandas(other) + if squeeze_other: + other = other.squeeze() + if squeeze_self: + result = self._pandas_frame.squeeze(axis=1).dot(other) + else: + result = self._pandas_frame.dot(other) + if isinstance(result, pandas.Series): + if result.name is None: + result.name = "__reduced__" + result = result.to_frame() + if is_list_like(result): + result = pandas.DataFrame(result) + else: + result = pandas.DataFrame([result]) + + return self.__constructor__(result) + + def get_axis(self, axis): + return self._pandas_frame.index if axis == 0 else self._pandas_frame.columns + + def get_index_name(self, axis=0): + return self.get_axis(axis).name + + def get_index_names(self, axis=0): + return self.get_axis(axis).names + + def set_index_name(self, name, axis=0): + self.get_axis(axis).name = name + + def has_multiindex(self, axis=0): + if axis == 0: + return isinstance(self._pandas_frame.index, pandas.MultiIndex) + assert axis == 1 + return isinstance(self._pandas_frame.columns, pandas.MultiIndex) + + def insert_item(self, *args, **kwargs): + return + + def to_pandas(self): + return self._pandas_frame + + @classmethod + def from_pandas(cls, df, data_cls): + return cls(data_cls.from_pandas(df)) + + @classmethod + def from_arrow(cls, at, data_cls): + return + + def free(self): + return + + def 
finalize(self): + return + + # Dataframe exchange protocol + + def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): + return self._pandas_frame.__dataframe__( + nan_as_null=nan_as_null, allow_copy=allow_copy + ) + + @classmethod + def from_dataframe(cls, df, data_cls): + return cls(data_cls.from_dataframe(df)) + + # END Dataframe exchange protocol + + index = property(_get_axis(0), _set_axis(0)) + columns = property(_get_axis(1), _set_axis(1)) + + @property + def dtypes(self): + return self._pandas_frame.dtypes + + def getitem_column_array(self, key, numeric=False): + if numeric: + return self.__constructor__(self._pandas_frame.iloc[:, key]) + return self.__constructor__(self._pandas_frame.loc[:, key]) + + def columnarize(self): + if len(self._pandas_frame.columns) != 1 or ( + len(self._pandas_frame.index) == 1 + and self._pandas_frame.index[0] == MODIN_UNNAMED_SERIES_LABEL + ): + return SmallQueryCompiler(self._pandas_frame.transpose()) + return self + + def is_series_like(self): + return ( + len(self._pandas_frame.columns) == 1 or len(self._pandas_frame.index) == 1 + ) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index f7eebcd30f2..2a4ca5129fb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -56,6 +56,9 @@ is_numeric_dtype, is_object_dtype, ) +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) from pandas.core.indexes.api import ensure_index from pandas.core.methods.describe import _refine_percentiles from pandas.util._validators import ( @@ -283,6 +286,8 @@ def _build_repr_df( indexer = row_indexer, _get_repr_axis_label_indexer(self.columns, num_cols) else: indexer = row_indexer + if isinstance(self._query_compiler, SmallQueryCompiler): + return self._query_compiler.to_pandas().iloc[indexer] return self.iloc[indexer]._query_compiler.to_pandas() def _update_inplace(self, new_query_compiler: BaseQueryCompiler) -> None: diff --git a/modin/pandas/dataframe.py 
b/modin/pandas/dataframe.py index fe28d3680e0..ac2a8b6692d 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -57,8 +57,11 @@ from pandas.io.formats.info import DataFrameInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import PersistentPickle +from modin.config import PersistentPickle,InitializeWithSmallQueryCompilers from modin.error_message import ErrorMessage +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) from modin.logging import disable_logging from modin.pandas import Categorical from modin.pandas.io import from_non_pandas, from_pandas, to_pandas @@ -81,7 +84,7 @@ _doc_binary_op, cast_function_modin2pandas, ) - +from modin.core.storage_formats import BaseQueryCompiler if TYPE_CHECKING: from modin.core.storage_formats import BaseQueryCompiler @@ -147,12 +150,16 @@ def __init__( query_compiler: BaseQueryCompiler = None, ) -> None: from modin.numpy import array - # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. 
self._siblings = [] + if query_compiler is not None: + if not isinstance(query_compiler, BaseQueryCompiler): + breakpoint() + print(type(query_compiler)) if isinstance(data, (DataFrame, Series)): - self._query_compiler = data._query_compiler.copy() + query_compiler = data._query_compiler.copy() + self._query_compiler = query_compiler if index is not None and any(i not in data.index for i in index): raise NotImplementedError( "Passing non-existant columns or index values to constructor not" @@ -203,6 +210,9 @@ def __init__( distributed_frame = from_non_pandas(data, index, columns, dtype) if distributed_frame is not None: self._query_compiler = distributed_frame._query_compiler + if self._query_compiler is not None: + if not isinstance(self._query_compiler, BaseQueryCompiler): + breakpoint() return warnings.warn( @@ -244,6 +254,9 @@ def __init__( new_qc = new_qc.reindex(axis=1, labels=columns) self._query_compiler = new_qc + if self._query_compiler is not None: + if not isinstance(self._query_compiler, BaseQueryCompiler): + breakpoint() return data = { @@ -256,6 +269,15 @@ def __init__( self._query_compiler = from_pandas(pandas_df)._query_compiler else: self._query_compiler = query_compiler + + if query_compiler is None and InitializeWithSmallQueryCompilers.get(): + small_dataframe = pandas.DataFrame( + data=data, index=index, columns=columns, dtype=dtype, copy=copy + ) + self._query_compiler = SmallQueryCompiler(small_dataframe) + if self._query_compiler is not None: + if not isinstance(self._query_compiler, BaseQueryCompiler): + breakpoint() def __repr__(self) -> str: """ @@ -3122,6 +3144,7 @@ def _to_pandas(self) -> pandas.DataFrame: ------- pandas.DataFrame """ + print(f"self._query_compiler {type(self._query_compiler)}") return self._query_compiler.to_pandas() def _validate_eval_query(self, expr, **kwargs) -> None: diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 508d1b2a4d5..045d44f657e 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -44,6 
+44,9 @@ ) import numpy as np +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) import pandas from pandas._libs.lib import NoDefault, no_default from pandas._typing import ( @@ -64,7 +67,7 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.readers import _c_parser_defaults -from modin.config import ModinNumpy +from modin.config import ModinNumpy, InitializeWithSmallQueryCompilers from modin.error_message import ErrorMessage from modin.logging import ClassLogger, enable_logging from modin.utils import ( @@ -990,6 +993,9 @@ def from_pandas(df) -> DataFrame: A new Modin DataFrame object. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher + + if InitializeWithSmallQueryCompilers.get(): + return ModinObjects.DataFrame(query_compiler=SmallQueryCompiler(df)) return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 7818c52654d..652be3289a5 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -31,7 +31,10 @@ from pandas.io.formats.info import SeriesInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import PersistentPickle +from modin.config import PersistentPickle,InitializeWithSmallQueryCompilers +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) from modin.logging import disable_logging from modin.pandas.io import from_pandas, to_pandas from modin.utils import ( @@ -51,7 +54,7 @@ StructAccessor, ) from .utils import _doc_binary_op, cast_function_modin2pandas, is_scalar - +from modin.core.storage_formats import BaseQueryCompiler if TYPE_CHECKING: import numpy.typing as npt @@ -114,6 +117,8 @@ def __init__( # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. 
self._siblings = [] + #breakpoint() + print(f"+++++++{type(query_compiler)}+") if isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: @@ -144,22 +149,37 @@ def __init__( name = MODIN_UNNAMED_SERIES_LABEL if isinstance(data, pandas.Series) and data.name is not None: name = data.name - - query_compiler = from_pandas( - pandas.DataFrame( - pandas.Series( - data=data, - index=index, - dtype=dtype, - name=name, - copy=copy, - fastpath=fastpath, + if InitializeWithSmallQueryCompilers.get(): + query_compiler = SmallQueryCompiler( + pandas.DataFrame( + pandas.Series( + data=data, + index=index, + dtype=dtype, + name=name, + copy=copy, + fastpath=fastpath, + ) ) ) - )._query_compiler + else: + query_compiler = from_pandas( + pandas.DataFrame( + pandas.Series( + data=data, + index=index, + dtype=dtype, + name=name, + copy=copy, + fastpath=fastpath, + ) + ) + )._query_compiler + breakpoint() self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name + def _get_name(self) -> Hashable: """ diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index ef7f199b57c..5479ccdc805 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -22,8 +22,10 @@ from pandas._typing import AggFuncType, AggFuncTypeBase, AggFuncTypeDict, IndexLabel from pandas.util._decorators import doc -from modin.utils import hashable - +from modin.config import InitializeWithSmallQueryCompilers +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) _doc_binary_operation = """ Return {operation} of {left} and `{right}` (binary operator `{bin_op}`). 
diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index e153f9f892f..70cb6b769c0 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat +from modin.config import NPartitions, StorageFormat, InitializeWithSmallQueryCompilers from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -210,6 +210,10 @@ def operation(df): StorageFormat.get() != "Pandas", reason="Modin on this engine doesn't create virtual partitions.", ) +@pytest.mark.skipif( + InitializeWithSmallQueryCompilers.get(), + reason="SmallQueryCompiler does not contain partitions.", +) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] ) diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index bad7e54031b..a96e5f3745b 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -22,7 +22,7 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat +from modin.config import Engine, NPartitions, StorageFormat, InitializeWithSmallQueryCompilers from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -122,7 +122,10 @@ def test_to_numpy(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) assert_array_equal(modin_df.values, pandas_df.values) - +@pytest.mark.skipif( + InitializeWithSmallQueryCompilers.get(), + reason="SmallQueryCompiler does not contain partitions.", +) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): frame = pd.DataFrame(data) @@ -313,6 +316,10 @@ def test_corr_non_numeric(self, numeric_only): 
StorageFormat.get() != "Pandas", reason="doesn't make sense for non-partitioned executions", ) + @pytest.mark.skipif( + InitializeWithSmallQueryCompilers.get(), + reason="SmallQueryCompiler does not contain partitions.", + ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition modin_df, pandas_df = create_test_dfs( diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 52603343619..4d8aa74b788 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,7 +21,7 @@ from pandas._testing import ensure_clean import modin.pandas as pd -from modin.config import MinRowPartitionSize, NPartitions +from modin.config import MinRowPartitionSize, NPartitions, InitializeWithSmallQueryCompilers from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( @@ -585,7 +585,9 @@ def test_loc_setting_single_categorical_column(): pandas_df.loc[1:3, "status"] = "a" df_equals(modin_df, pandas_df) - +@pytest.mark.skipif( + InitializeWithSmallQueryCompilers.get(), + reason="SmallQueryCompiler does not currently support IO functions.",) def test_loc_multi_index(): modin_df = pd.read_csv( "modin/tests/pandas/data/blah.csv", header=[0, 1, 2, 3], index_col=0 diff --git a/modin/utils.py b/modin/utils.py index 34071be132b..c29227e1ba3 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -651,7 +651,7 @@ def hashable(obj: bool) -> bool: return True -def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any: +def try_cast_to_pandas(obj: Any, squeeze: bool = False, squeeze_df: bool = False) -> Any: """ Convert `obj` and all nested objects from Modin to pandas if it is possible. @@ -663,6 +663,8 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any: Object to convert from Modin to pandas. 
squeeze : bool, default: False Squeeze the converted object(s) before returning them. + squeeze_df : bool, default: False + Squeeze the converted DataFrame(s) if Series-like. Returns ------- @@ -673,7 +675,15 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any: result = obj.modin.to_pandas() if hasattr(obj, "modin") else obj.to_pandas() if squeeze: result = result.squeeze(axis=1) - + if ( + squeeze_df + and isinstance(result, pandas.DataFrame) + and len(result.columns) == 1 + and result.columns[0] == MODIN_UNNAMED_SERIES_LABEL + ): + result = result.squeeze(axis=1) + + # QueryCompiler/low-level ModinFrame case, it doesn't have logic about convertion to Series if ( isinstance(getattr(result, "name", None), str) @@ -682,9 +692,11 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any: result.name = None return result if isinstance(obj, (list, tuple)): - return type(obj)([try_cast_to_pandas(o, squeeze=squeeze) for o in obj]) + return type(obj)( + [try_cast_to_pandas(o, squeeze=squeeze, squeeze_df=squeeze_df) for o in obj] + ) if isinstance(obj, dict): - return {k: try_cast_to_pandas(v, squeeze=squeeze) for k, v in obj.items()} + return {k: try_cast_to_pandas(v, squeeze=squeeze, squeeze_df=squeeze_df) for k, v in obj.items()} if callable(obj): module_hierarchy = getattr(obj, "__module__", "").split(".") fn_name = getattr(obj, "__name__", None) From 340151353b8d1a2f5adfb4acb0ce27f36aaff2d2 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Mon, 13 May 2024 13:27:05 -0500 Subject: [PATCH 02/19] fixing tests --- docs/conf.py | 4 +- modin/config/__init__.py | 5 +- modin/config/envvars.py | 2 + .../pandas/small_query_compiler.py | 490 +++++++++++++----- modin/pandas/base.py | 6 +- modin/pandas/dataframe.py | 21 +- modin/pandas/io.py | 10 +- modin/pandas/series.py | 8 +- modin/pandas/utils.py | 6 +- modin/tests/pandas/dataframe/test_default.py | 41 +- modin/tests/pandas/dataframe/test_indexing.py | 12 +- modin/tests/pandas/dataframe/test_pickle.py | 1 
- modin/tests/pandas/test_expanding.py | 16 +- modin/utils.py | 12 +- setup.cfg | 3 + versioneer.py | 147 ++++++ 16 files changed, 610 insertions(+), 174 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 783eb11e414..61f49793efb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,7 +48,9 @@ def noop_decorator(*args, **kwargs): if not hasattr(sys.modules["unidist"].core.base, "object_ref"): sys.modules["unidist"].core.base.object_ref = type("object_ref", (object,), {}) if not hasattr(sys.modules["unidist"].core.base.object_ref, "ObjectRef"): - sys.modules["unidist"].core.base.object_ref.ObjectRef = type("ObjectRef", (object,), {}) + sys.modules["unidist"].core.base.object_ref.ObjectRef = type( + "ObjectRef", (object,), {} + ) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import modin diff --git a/modin/config/__init__.py b/modin/config/__init__.py index 4e6cf88ddd3..d81655b9dbd 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -27,6 +27,7 @@ EnvironmentVariable, GithubCI, GpuCount, + InitializeWithSmallQueryCompilers, IsDebug, IsExperimental, IsRayCluster, @@ -53,7 +54,7 @@ TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, - InitializeWithSmallQueryCompilers + ) from modin.config.pubsub import Parameter, ValueSource, context @@ -69,7 +70,7 @@ "CpuCount", "GpuCount", "Memory", - "PersistentPickle" + "InitializeWithSmallQueryCompilers", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 9b045992417..43bf5925471 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -911,6 +911,8 @@ def _check_vars() -> None: deprecated[depr_var].deprecation_message(use_envvar_names=True), FutureWarning, ) + + class InitializeWithSmallQueryCompilers(EnvironmentVariable, type=str): """Set to true to use implementation of SmallQueryCompiler.""" diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py 
b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index f92e9b4f9b9..7d5c6be6f00 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -18,17 +18,17 @@ queries for small data and empty ``PandasDataFrame``. """ -from modin.config.envvars import InitializeWithSmallQueryCompilers +import warnings + import numpy as np import pandas -from pandas.core.dtypes.common import ( - is_list_like, - is_scalar, -) +from pandas.core.dtypes.common import is_list_like, is_scalar +from modin.config.envvars import InitializeWithSmallQueryCompilers from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler -from modin.utils import MODIN_UNNAMED_SERIES_LABEL +from modin.error_message import ErrorMessage from modin.utils import ( + MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, try_cast_to_pandas, ) @@ -262,7 +262,7 @@ def groupby_callable( agg_func=None, how="axis_wise", drop=False, - **kwargs + **kwargs, ): by_names = [] if isinstance(by, pandas.DataFrame): @@ -337,19 +337,65 @@ def _register_binary(op): """ def binary_operator(df, other, **kwargs): - if isinstance(other, pandas.DataFrame) and ( - not df.empty - or ( - len(other.columns) == 1 - and other.columns[0] == MODIN_UNNAMED_SERIES_LABEL - ) - ): - other = other.squeeze() + + # if isinstance(other, pandas.DataFrame) and ( + # not df.empty + # or ( + # len(other.columns) == 1 + # and other.columns[0] == MODIN_UNNAMED_SERIES_LABEL + # ) + # ): + # other = other.squeeze() + squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop( + "squeeze_other", False + ) + squeeze_self = kwargs.pop("squeeze_self", False) + + if squeeze_other: + other = other.squeeze(axis=1) + + if squeeze_self: + df = df.squeeze(axis=1) + return getattr(df, op)(other, **kwargs) return binary_operator +def _register_exanding(func): + def binary_operator(df, fold_axis, rolling_args, *args, **kwargs): + # if 
+ # other_for_default = ( + # other + # if other is None + # else ( + # other.to_pandas().squeeze(axis=1) + # if squeeze_other + # else other.to_pandas() + # ) + # ) + + # if isinstance(other, pandas.DataFrame) and ( + # not df.empty + # or ( + # len(other.columns) == 1 + # and other.columns[0] == MODIN_UNNAMED_SERIES_LABEL + # ) + # ): + # other = other.squeeze() + squeeze_self = kwargs.pop("squeeze_self", False) + + if squeeze_self: + df = df.squeeze(axis=1) + roller = df.expanding(*rolling_args) + if type(func) is property: + return func.fget(roller) + + return func(roller, *args, **kwargs) + + return binary_operator + + def _register_resample(op): """ Build function that apply specified resample method of the passed frame. @@ -383,10 +429,16 @@ def _drop(df, **kwargs): # noqa: GL08 return df -def _fillna(df, squeeze_self=True, squeeze_value=False, **kwargs): # noqa: GL08 - if len(df.columns) == 1 and df.columns[0] == "__reduced__": - df = df["__reduced__"] - return df.fillna(**kwargs) +def _fillna(df, value, **kwargs): # noqa: GL08 + squeeze_self = kwargs.pop("squeeze_self", False) + squeeze_value = kwargs.pop("squeeze_value", False) + if squeeze_self and isinstance(df, pandas.DataFrame): + df = df.squeeze(axis=1) + if squeeze_value and isinstance(value, pandas.DataFrame): + value = value.squeeze(axis=1) + # if len(df.columns) == 1 and df.columns[0] == "__reduced__": + # df = df["__reduced__"] + return df.fillna(value, **kwargs) def _is_monotonic(monotonic_type): # noqa: GL08 @@ -423,12 +475,28 @@ def _getitem_row_array(df, key): # noqa: GL08 def _write_items( - df, row_numeric_index, col_numeric_index, broadcasted_items + df, + row_numeric_index, + col_numeric_index, + broadcasted_items, + need_columns_reindex=True, ): # noqa: GL08 + from modin.pandas.utils import broadcast_item, is_scalar + if not isinstance(row_numeric_index, slice): row_numeric_index = list(row_numeric_index) if not isinstance(col_numeric_index, slice): col_numeric_index = 
list(col_numeric_index) + if not is_scalar(broadcasted_items): + broadcasted_items, _ = broadcast_item( + df, + row_numeric_index, + col_numeric_index, + broadcasted_items, + need_columns_reindex=need_columns_reindex, + ) + else: + broadcasted_items = broadcasted_items if isinstance(df.iloc[row_numeric_index, col_numeric_index], pandas.Series): broadcasted_items = broadcasted_items.squeeze() @@ -454,6 +522,85 @@ def _get_dummies(df, columns, **kwargs): # noqa: GL08 return pandas.get_dummies(df, columns=columns, **kwargs) +def _register_default_pandas( + func, + is_series=False, + squeeze_series=False, + squeeze_args=False, + squeeze_kwargs=False, + return_modin=True, + in_place=False, + df_copy=False, + filter_kwargs=[], +): + """ + Build function that apply specified method of the passed frame. + + Parameters + ---------- + func : callable + Function to apply. + is_series : bool, default: False + If True, the passed frame will always be squeezed to a series. + squeeze_series : bool, default: False + If True, the passed frame will always be squeezed to a series if there is a single column named "__reduced__". + squeeze_args : bool, default: False + If True, all passed arguments will be squeezed. + squeeze_kwargs : bool, default: False + If True, all passed key word arguments will be squeezed. + return_modin : bool, default: True + If True, the result will always try to convert to DataFrame or Series. + in_place : bool, default: False + If True, the specified function will be applied on the passed frame in place. + df_copy : bool, default: False + If True, the specified function will be applied to a copy of the passed frame. + filter_kwargs : list, default: [] + List of key word argument names to remove. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. 
+ """ + + def caller(query_compiler, *args, **kwargs): + df = query_compiler._pandas_frame + if df_copy: + df = df.copy() + if is_series: + df = df.squeeze(axis=1) + elif ( + squeeze_series + and len(df.columns) == 1 + and df.columns[0] == MODIN_UNNAMED_SERIES_LABEL + ): + df = df.squeeze(axis=1) + exclude_names = [ + # "broadcast", + "fold_axis", + # "squeeze_self", + # "squeeze_value", + "ignore_indices", + ] + filter_kwargs + kwargs = kwargs.copy() + for name in exclude_names: + kwargs.pop(name, None) + args = try_cast_to_pandas(args, squeeze=squeeze_args, squeeze_df=True) + kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs, squeeze_df=True) + result = func(df, *args, **kwargs) + if in_place: + result = df + if not (return_modin or isinstance(result, (pandas.Series, pandas.DataFrame))): + return result + if isinstance(result, pandas.Series): + if result.name is None: + result.name = MODIN_UNNAMED_SERIES_LABEL + result = result.to_frame() + return query_compiler.__constructor__(result) + + return caller + + @_inherit_docstrings(BaseQueryCompiler) class SmallQueryCompiler(BaseQueryCompiler): """ @@ -479,106 +626,97 @@ def __init__(self, pandas_frame): self._pandas_frame = pandas_frame + # def default_to_pandas(self, pandas_op, *args, **kwargs): + # args = (a.to_pandas() if isinstance(a, type(self)) else a for a in args) + # kwargs = { + # k: v.to_pandas if isinstance(v, type(self)) else v + # for k, v in kwargs.items() + # } + # op_name = getattr(pandas_op, "__name__", str(pandas_op)) + # ErrorMessage.default_to_pandas(op_name) + + # result = pandas_op(self._pandas_frame, *args, **kwargs) + # if isinstance(result, pandas.Series): + # if result.name is None: + # result.name = MODIN_UNNAMED_SERIES_LABEL + # result = result.to_frame() + + # return result + def default_to_pandas(self, pandas_op, *args, **kwargs): - args = (a.to_pandas() if isinstance(a, type(self)) else a for a in args) - kwargs = { - k: v.to_pandas if isinstance(v, type(self)) else v - for 
k, v in kwargs.items() - } + """ + Do fallback to pandas for the passed function. - result = pandas_op(self._pandas_frame, *args, **kwargs) - if isinstance(result, pandas.Series): - if result.name is None: - result.name = MODIN_UNNAMED_SERIES_LABEL - result = result.to_frame() + Parameters + ---------- + pandas_op : callable(pandas.DataFrame) -> object + Function to apply to the casted to pandas frame. + *args : iterable + Positional arguments to pass to `pandas_op`. + **kwargs : dict + Key-value arguments to pass to `pandas_op`. - return result + Returns + ------- + BaseQueryCompiler + The result of the `pandas_op`, converted back to ``BaseQueryCompiler``. + """ + op_name = getattr(pandas_op, "__name__", str(pandas_op)) + ErrorMessage.default_to_pandas(op_name) + args = try_cast_to_pandas(args) + kwargs = try_cast_to_pandas(kwargs) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning) + result = pandas_op(try_cast_to_pandas(self), *args, **kwargs) + if isinstance(result, (tuple, list)): + if "Series.tolist" in pandas_op.__name__: + # fast path: no need to iterate over the result from `tolist` function + return result + return [self.__wrap_in_qc(obj) for obj in result] + # breakpoint() + return type(self)(result) - def execute(self): - """Wait for all computations to complete without materializing data.""" - pass - - - - def _register_default_pandas( - func, - is_series=False, - squeeze_series=False, - squeeze_args=False, - squeeze_kwargs=False, - return_modin=True, - in_place=False, - df_copy=False, - filter_kwargs=[], - ): + def __wrap_in_qc(self, obj): """ - Build function that apply specified method of the passed frame. + Wrap `obj` in query compiler. Parameters ---------- - func : callable - Function to apply. - is_series : bool, default: False - If True, the passed frame will always be squeezed to a series. 
- squeeze_series : bool, default: False - If True, the passed frame will always be squeezed to a series if there is a single column named "__reduced__". - squeeze_args : bool, default: False - If True, all passed arguments will be squeezed. - squeeze_kwargs : bool, default: False - If True, all passed key word arguments will be squeezed. - return_modin : bool, default: True - If True, the result will always try to convert to DataFrame or Series. - in_place : bool, default: False - If True, the specified function will be applied on the passed frame in place. - df_copy : bool, default: False - If True, the specified function will be applied to a copy of the passed frame. - filter_kwargs : list, default: [] - List of key word argument names to remove. + obj : any + Object to wrap. Returns ------- - callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame - Function to be applied to the frame. + BaseQueryCompiler + Query compiler wrapping the object. """ + if isinstance(obj, pandas.Series): + if obj.name is None: + obj.name = MODIN_UNNAMED_SERIES_LABEL + obj = obj.to_frame() + if isinstance(obj, pandas.DataFrame): + return self.from_pandas(obj, type(self._pandas_frame)) + else: + return obj - def caller(query_compiler, *args, **kwargs): - df = query_compiler._pandas_frame - if df_copy: - df = df.copy() - if is_series: - df = df.squeeze(axis=1) - elif ( - squeeze_series - and len(df.columns) == 1 - and df.columns[0] == MODIN_UNNAMED_SERIES_LABEL - ): - df = df.squeeze(axis=1) - exclude_names = [ - "broadcast", - "fold_axis", - "squeeze_self", - "squeeze_value", - "ignore_indices" - ] + filter_kwargs - kwargs = kwargs.copy() - for name in exclude_names: - kwargs.pop(name, None) - args = try_cast_to_pandas(args, squeeze=squeeze_args, squeeze_df=True) - kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs, squeeze_df=True) - result = func(df, *args, **kwargs) - if in_place: - result = df - if not ( - return_modin or isinstance(result, (pandas.Series, 
pandas.DataFrame)) - ): - return result - if isinstance(result, pandas.Series): - if result.name is None: - result.name = MODIN_UNNAMED_SERIES_LABEL - result = result.to_frame() - return query_compiler.__constructor__(result) + def execute(self): + """Wait for all computations to complete without materializing data.""" + pass - return caller + def take_2d_positional(self, index=None, columns=None): + index = slice(None) if index is None else index + columns = slice(None) if columns is None else columns + self._pandas_frame.iloc[index, columns] + return self.__constructor__(self._pandas_frame.iloc[index, columns]) + + def copy(self): + return self.__constructor__(self._pandas_frame.copy()) + + def setitem_bool(self, row_loc, col_loc, item): + + self._pandas_frame.loc[row_loc._pandas_frame.squeeze(axis=1), col_loc] = item + return self.__constructor__(self._pandas_frame) __and__ = _register_default_pandas(pandas.DataFrame.__and__, squeeze_series=True) __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) @@ -609,6 +747,7 @@ def caller(query_compiler, *args, **kwargs): apply_on_series = _register_default_pandas(pandas.Series.apply, is_series=True) applymap = _register_default_pandas(pandas.DataFrame.applymap) astype = _register_default_pandas(pandas.DataFrame.astype) + case_when = _register_default_pandas(pandas.Series.case_when) cat_codes = _register_default_pandas(lambda ser: ser.cat.codes, is_series=True) clip = _register_default_pandas(pandas.DataFrame.clip) combine = _register_default_pandas(_combine, squeeze_series=True) @@ -621,7 +760,6 @@ def caller(query_compiler, *args, **kwargs): lambda df, *args, **kwargs: pandas.DataFrame(np.conj(df)) ) convert_dtypes = _register_default_pandas(pandas.DataFrame.convert_dtypes) - copy = _register_default_pandas(pandas.DataFrame.copy) count = _register_default_pandas(pandas.DataFrame.count) corr = _register_default_pandas(pandas.DataFrame.corr) cov = _register_default_pandas(pandas.DataFrame.cov) @@ -691,15 +829,60 
@@ def caller(query_compiler, *args, **kwargs): dt_weekday = _register_default_pandas(_dt_prop_map("weekday")) dt_weekofyear = _register_default_pandas(_dt_prop_map("weekofyear")) dt_year = _register_default_pandas(_dt_prop_map("year")) + duplicated = _register_default_pandas(pandas.DataFrame.duplicated) eq = _register_default_pandas(_register_binary("eq"), filter_kwargs=["dtypes"]) + equals = _register_default_pandas(_register_binary("equals")) eval = _register_default_pandas(pandas.DataFrame.eval) explode = _register_default_pandas(pandas.DataFrame.explode) + expanding_count = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.count) + ) + expanding_sum = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.sum) + ) + expanding_mean = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.mean) + ) + expanding_median = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.median) + ) + expanding_std = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.std) + ) + expanding_min = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.min) + ) + expanding_max = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.max) + ) + expanding_skew = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.skew) + ) + expanding_kurt = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.kurt) + ) + expanding_sem = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.sem) + ) + expanding_quantile = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.quantile) + ) + expanding_aggregate = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.aggregate) + ) + expanding_var = 
_register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.var) + ) + expanding_rank = _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.rank) + ) + fillna = _register_default_pandas(_fillna) first_valid_index = _register_default_pandas( pandas.DataFrame.first_valid_index, return_modin=False ) floordiv = _register_default_pandas(_register_binary("floordiv")) - ge = _register_default_pandas(pandas.DataFrame.ge, filter_kwargs=["dtypes"]) + ge = _register_default_pandas(_register_binary("ge"), filter_kwargs=["dtypes"]) get_dummies = _register_default_pandas(_get_dummies) getitem_array = _register_default_pandas(_getitem_array) getitem_row_array = _register_default_pandas(_getitem_row_array) @@ -727,7 +910,7 @@ def caller(query_compiler, *args, **kwargs): groupby_std = _register_default_pandas(_groupby("std")) groupby_sum = _register_default_pandas(_groupby("sum")) groupby_var = _register_default_pandas(_groupby("var")) - gt = _register_default_pandas(pandas.DataFrame.gt, filter_kwargs=["dtypes"]) + gt = _register_default_pandas(_register_binary("gt"), filter_kwargs=["dtypes"]) idxmax = _register_default_pandas(pandas.DataFrame.idxmax) idxmin = _register_default_pandas(pandas.DataFrame.idxmin) infer_objects = _register_default_pandas( @@ -753,10 +936,12 @@ def caller(query_compiler, *args, **kwargs): last_valid_index = _register_default_pandas( pandas.DataFrame.last_valid_index, return_modin=False ) - le = _register_default_pandas(pandas.DataFrame.le, filter_kwargs=["dtypes"]) - lt = _register_default_pandas(pandas.DataFrame.lt, filter_kwargs=["dtypes"]) - #mad = _register_default_pandas(pandas.DataFrame.mad) + le = _register_default_pandas(_register_binary("le"), filter_kwargs=["dtypes"]) + lt = _register_default_pandas(_register_binary("lt"), filter_kwargs=["dtypes"]) + # mad = _register_default_pandas(pandas.DataFrame.mad) + mask = _register_default_pandas(pandas.DataFrame.mask) max = 
_register_default_pandas(pandas.DataFrame.max) + map = _register_default_pandas(pandas.DataFrame.map) mean = _register_default_pandas(pandas.DataFrame.mean) median = _register_default_pandas(pandas.DataFrame.median) melt = _register_default_pandas(pandas.DataFrame.melt) @@ -766,7 +951,7 @@ def caller(query_compiler, *args, **kwargs): mod = _register_default_pandas(_register_binary("mod")) mode = _register_default_pandas(pandas.DataFrame.mode) mul = _register_default_pandas(_register_binary("mul")) - ne = _register_default_pandas(pandas.DataFrame.ne, filter_kwargs=["dtypes"]) + ne = _register_default_pandas(_register_binary("ne"), filter_kwargs=["dtypes"]) negative = _register_default_pandas(pandas.DataFrame.__neg__) nlargest = _register_default_pandas(pandas.DataFrame.nlargest) notna = _register_default_pandas(pandas.DataFrame.notna) @@ -925,7 +1110,7 @@ def caller(query_compiler, *args, **kwargs): is_series=True, ) transpose = _register_default_pandas(pandas.DataFrame.transpose) - truediv = _register_default_pandas(_register_binary("truediv")) + truediv = _register_default_pandas(_register_binary("truediv"), squeeze_series=True) unique = _register_default_pandas(pandas.Series.unique, is_series=True) unstack = _register_default_pandas(pandas.DataFrame.unstack) var = _register_default_pandas(pandas.DataFrame.var) @@ -977,6 +1162,78 @@ def dot(self, other, squeeze_self=None, squeeze_other=None): return self.__constructor__(result) + def expanding_cov( + self, + fold_axis, + expanding_args, + squeeze_self, + squeeze_other, + other=None, + pairwise=None, + ddof=1, + numeric_only=False, + **kwargs, + ): + other_for_default = ( + other + if other is None + else ( + other.to_pandas().squeeze(axis=1) + if squeeze_other + else other.to_pandas() + ) + ) + # expanding_rank = _register_default_pandas(_register_exanding(pandas.core.window.expanding.Expanding.rank)) + + return _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.cov) + )( + self, 
+ fold_axis, + expanding_args, + other=other_for_default, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + squeeze_self=squeeze_self, + **kwargs, + ) + + def expanding_corr( + self, + fold_axis, + expanding_args, + squeeze_self, + squeeze_other, + other=None, + pairwise=None, + ddof=1, + numeric_only=False, + **kwargs, + ): + other_for_default = ( + other + if other is None + else ( + other.to_pandas().squeeze(axis=1) + if squeeze_other + else other.to_pandas() + ) + ) + return _register_default_pandas( + _register_exanding(pandas.core.window.expanding.Expanding.corr) + )( + self, + fold_axis, + expanding_args, + other=other_for_default, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + squeeze_self=squeeze_self, + **kwargs, + ) + def get_axis(self, axis): return self._pandas_frame.index if axis == 0 else self._pandas_frame.columns @@ -995,15 +1252,12 @@ def has_multiindex(self, axis=0): assert axis == 1 return isinstance(self._pandas_frame.columns, pandas.MultiIndex) - def insert_item(self, *args, **kwargs): - return - def to_pandas(self): return self._pandas_frame @classmethod def from_pandas(cls, df, data_cls): - return cls(data_cls.from_pandas(df)) + return cls(df) @classmethod def from_arrow(cls, at, data_cls): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 2a4ca5129fb..bc9e89ba2bb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -56,9 +56,6 @@ is_numeric_dtype, is_object_dtype, ) -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, -) from pandas.core.indexes.api import ensure_index from pandas.core.methods.describe import _refine_percentiles from pandas.util._validators import ( @@ -69,6 +66,9 @@ from modin import pandas as pd from modin.error_message import ErrorMessage +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) from modin.logging import ClassLogger, disable_logging from 
modin.pandas.accessor import CachedAccessor, ModinAPI from modin.pandas.utils import is_scalar diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index ac2a8b6692d..818ea4238d9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -57,7 +57,7 @@ from pandas.io.formats.info import DataFrameInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import PersistentPickle,InitializeWithSmallQueryCompilers +from modin.config import InitializeWithSmallQueryCompilers, PersistentPickle from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( SmallQueryCompiler, @@ -84,7 +84,7 @@ _doc_binary_op, cast_function_modin2pandas, ) -from modin.core.storage_formats import BaseQueryCompiler + if TYPE_CHECKING: from modin.core.storage_formats import BaseQueryCompiler @@ -150,13 +150,10 @@ def __init__( query_compiler: BaseQueryCompiler = None, ) -> None: from modin.numpy import array + # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. 
self._siblings = [] - if query_compiler is not None: - if not isinstance(query_compiler, BaseQueryCompiler): - breakpoint() - print(type(query_compiler)) if isinstance(data, (DataFrame, Series)): query_compiler = data._query_compiler.copy() self._query_compiler = query_compiler @@ -210,9 +207,6 @@ def __init__( distributed_frame = from_non_pandas(data, index, columns, dtype) if distributed_frame is not None: self._query_compiler = distributed_frame._query_compiler - if self._query_compiler is not None: - if not isinstance(self._query_compiler, BaseQueryCompiler): - breakpoint() return warnings.warn( @@ -254,9 +248,6 @@ def __init__( new_qc = new_qc.reindex(axis=1, labels=columns) self._query_compiler = new_qc - if self._query_compiler is not None: - if not isinstance(self._query_compiler, BaseQueryCompiler): - breakpoint() return data = { @@ -269,15 +260,12 @@ def __init__( self._query_compiler = from_pandas(pandas_df)._query_compiler else: self._query_compiler = query_compiler - + if query_compiler is None and InitializeWithSmallQueryCompilers.get(): small_dataframe = pandas.DataFrame( data=data, index=index, columns=columns, dtype=dtype, copy=copy ) self._query_compiler = SmallQueryCompiler(small_dataframe) - if self._query_compiler is not None: - if not isinstance(self._query_compiler, BaseQueryCompiler): - breakpoint() def __repr__(self) -> str: """ @@ -3144,7 +3132,6 @@ def _to_pandas(self) -> pandas.DataFrame: ------- pandas.DataFrame """ - print(f"self._query_compiler {type(self._query_compiler)}") return self._query_compiler.to_pandas() def _validate_eval_query(self, expr, **kwargs) -> None: diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 045d44f657e..13bea82a210 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -44,9 +44,6 @@ ) import numpy as np -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, -) import pandas from pandas._libs.lib import NoDefault, no_default from pandas._typing 
import ( @@ -69,6 +66,9 @@ from modin.config import ModinNumpy, InitializeWithSmallQueryCompilers from modin.error_message import ErrorMessage +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + SmallQueryCompiler, +) from modin.logging import ClassLogger, enable_logging from modin.utils import ( SupportsPrivateToNumPy, @@ -993,8 +993,8 @@ def from_pandas(df) -> DataFrame: A new Modin DataFrame object. """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher - - if InitializeWithSmallQueryCompilers.get(): + + if InitializeWithSmallQueryCompilers.get(): return ModinObjects.DataFrame(query_compiler=SmallQueryCompiler(df)) return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 652be3289a5..ede267bfedb 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -31,7 +31,7 @@ from pandas.io.formats.info import SeriesInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import PersistentPickle,InitializeWithSmallQueryCompilers +from modin.config import InitializeWithSmallQueryCompilers, PersistentPickle from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( SmallQueryCompiler, ) @@ -54,7 +54,7 @@ StructAccessor, ) from .utils import _doc_binary_op, cast_function_modin2pandas, is_scalar -from modin.core.storage_formats import BaseQueryCompiler + if TYPE_CHECKING: import numpy.typing as npt @@ -117,8 +117,6 @@ def __init__( # Siblings are other dataframes that share the same query compiler. We # use this list to update inplace when there is a shallow copy. 
self._siblings = [] - #breakpoint() - print(f"+++++++{type(query_compiler)}+") if isinstance(data, type(self)): query_compiler = data._query_compiler.copy() if index is not None: @@ -175,11 +173,9 @@ def __init__( ) ) )._query_compiler - breakpoint() self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name - def _get_name(self) -> Hashable: """ diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 5479ccdc805..09b19637e88 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -22,10 +22,14 @@ from pandas._typing import AggFuncType, AggFuncTypeBase, AggFuncTypeDict, IndexLabel from pandas.util._decorators import doc -from modin.config import InitializeWithSmallQueryCompilers + + from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( SmallQueryCompiler, ) + +from modin.utils import hashable + _doc_binary_operation = """ Return {operation} of {left} and `{right}` (binary operator `{bin_op}`). diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index a96e5f3745b..87bb1e15bf3 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
+import contextlib import io import warnings @@ -22,7 +23,12 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat, InitializeWithSmallQueryCompilers +from modin.config import ( + Engine, + InitializeWithSmallQueryCompilers, + NPartitions, + StorageFormat, +) from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -87,7 +93,11 @@ ) def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not InitializeWithSmallQueryCompilers.get() + else contextlib.nullcontext() + ): operation = getattr(modin_df, op) if make_args is not None: operation(**make_args(modin_df)) @@ -101,7 +111,11 @@ def test_ops_defaulting_to_pandas(op, make_args): def test_style(): data = test_data_values[0] - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not InitializeWithSmallQueryCompilers.get() + else contextlib.nullcontext() + ): pd.DataFrame(data).style @@ -109,7 +123,11 @@ def test_to_timestamp(): idx = pd.date_range("1/1/2012", periods=5, freq="M") df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not InitializeWithSmallQueryCompilers.get() + else contextlib.nullcontext() + ): df.to_period().to_timestamp() @@ -122,6 +140,7 @@ def test_to_numpy(data): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) assert_array_equal(modin_df.values, pandas_df.values) + @pytest.mark.skipif( InitializeWithSmallQueryCompilers.get(), reason="SmallQueryCompiler does not contain partitions.", @@ -137,7 +156,11 @@ def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="min") series = pd.Series([0.0, None, 2.0, 3.0], index=index) df = 
pd.DataFrame({"s": series}) - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not InitializeWithSmallQueryCompilers.get() + else contextlib.nullcontext() + ): # We are only testing that this defaults to pandas, so we will just check for # the warning df.asfreq(freq="30S") @@ -297,8 +320,8 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - - assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) + if not InitializeWithSmallQueryCompilers.get(): + assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) ) @@ -317,8 +340,8 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for non-partitioned executions", ) @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + InitializeWithSmallQueryCompilers.get(), + reason="SmallQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 4d8aa74b788..5b3b56d1a51 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -63,7 +63,6 @@ def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=Non col = pd_df.columns[loc] value_getter = value if callable(value) else (lambda *args, **kwargs: value) - eval_general( md_df, pd_df, @@ -80,7 +79,6 @@ def eval_loc(md_df, pd_df, value, key): md_value, pd_value = value else: md_value, pd_value = value, value - eval_general( md_df, pd_df, @@ -523,7 +521,6 @@ def test_loc_4456( if reverse_value_columns: pdf_value = pdf_value.reindex(columns=pdf_value.columns[::-1]) mdf_value = 
mdf_value.reindex(columns=mdf_value.columns[::-1]) - eval_loc(modin_df, pandas_df, pdf_value, key) eval_loc(modin_df, pandas_df, (mdf_value, pdf_value), key) @@ -585,9 +582,11 @@ def test_loc_setting_single_categorical_column(): pandas_df.loc[1:3, "status"] = "a" df_equals(modin_df, pandas_df) + @pytest.mark.skipif( InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not currently support IO functions.",) + reason="SmallQueryCompiler does not currently support IO functions.", +) def test_loc_multi_index(): modin_df = pd.read_csv( "modin/tests/pandas/data/blah.csv", header=[0, 1, 2, 3], index_col=0 @@ -1484,6 +1483,7 @@ def test_reset_index(data, test_async_reset_index): pd_df_cp = pandas_df.copy() if test_async_reset_index: modin_df._query_compiler.set_frame_index_cache(None) + modin_df_cp.reset_index(inplace=True) pd_df_cp.reset_index(inplace=True) df_equals(modin_df_cp, pd_df_cp) @@ -2240,6 +2240,10 @@ def test___setitem__partitions_aligning(): df_equals(md_df, pd_df) +@pytest.mark.skipif( + InitializeWithSmallQueryCompilers.get(), + reason="SmallQueryCompiler does not currently support IO functions.", +) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: np.savetxt(fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=",") diff --git a/modin/tests/pandas/dataframe/test_pickle.py b/modin/tests/pandas/dataframe/test_pickle.py index 97c78c9cd74..5450ca4f26c 100644 --- a/modin/tests/pandas/dataframe/test_pickle.py +++ b/modin/tests/pandas/dataframe/test_pickle.py @@ -52,7 +52,6 @@ def test__reduce__(): # pre-processed for the distributed engine. 
dataframe_data = ["Major League Baseball", "National Basketball Association"] abbr_md, abbr_pd = create_test_dfs(dataframe_data, index=["MLB", "NBA"]) - # breakpoint() dataframe_data = { "name": ["Mariners", "Lakers"] * 500, diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index fe184dbd249..85e08595a89 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -11,12 +11,14 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import contextlib + import numpy as np import pandas import pytest import modin.pandas as pd -from modin.config import NPartitions +from modin.config import InitializeWithSmallQueryCompilers, NPartitions from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -67,7 +69,11 @@ def test_dataframe(data, min_periods, axis, method, kwargs): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov(data, min_periods, axis, method): - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not InitializeWithSmallQueryCompilers.get() + else contextlib.nullcontext() + ): eval_general( *create_test_dfs(data), lambda df: getattr( @@ -79,7 +85,11 @@ def test_dataframe_corr_cov(data, min_periods, axis, method): @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov_with_self(method): mdf, pdf = create_test_dfs(test_data["float_nan_data"]) - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not InitializeWithSmallQueryCompilers.get() + else contextlib.nullcontext() + ): eval_general( mdf, pdf, diff --git a/modin/utils.py b/modin/utils.py index c29227e1ba3..32c7b7a9a5a 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -651,7 +651,9 @@ def hashable(obj: bool) -> bool: return True -def 
try_cast_to_pandas(obj: Any, squeeze: bool = False, squeeze_df: bool = False) -> Any: +def try_cast_to_pandas( + obj: Any, squeeze: bool = False, squeeze_df: bool = False +) -> Any: """ Convert `obj` and all nested objects from Modin to pandas if it is possible. @@ -682,8 +684,7 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False, squeeze_df: bool = False and result.columns[0] == MODIN_UNNAMED_SERIES_LABEL ): result = result.squeeze(axis=1) - - + # QueryCompiler/low-level ModinFrame case, it doesn't have logic about convertion to Series if ( isinstance(getattr(result, "name", None), str) @@ -696,7 +697,10 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False, squeeze_df: bool = False [try_cast_to_pandas(o, squeeze=squeeze, squeeze_df=squeeze_df) for o in obj] ) if isinstance(obj, dict): - return {k: try_cast_to_pandas(v, squeeze=squeeze, squeeze_df=squeeze_df) for k, v in obj.items()} + return { + k: try_cast_to_pandas(v, squeeze=squeeze, squeeze_df=squeeze_df) + for k, v in obj.items() + } if callable(obj): module_hierarchy = getattr(obj, "__module__", "").split(".") fn_name = getattr(obj, "__name__", None) diff --git a/setup.cfg b/setup.cfg index 4296cb97859..e6878fbaf72 100644 --- a/setup.cfg +++ b/setup.cfg @@ -68,3 +68,6 @@ exclude_lines = raise ImportError assert pass + +[pytest] +addopts = --env=MODIN_SMALL_QUERY_COMPILER=True \ No newline at end of file diff --git a/versioneer.py b/versioneer.py index 0ae83dbaaf9..dabe8ac1dff 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,8 @@ +<<<<<<< HEAD # Version: 0.29 +======= +# Version: 0.18 +>>>>>>> 4e967422 (fixing tests) """The Versioneer - like a rocketeer, but for versions. 
@@ -360,11 +364,15 @@ def get_root() -> str: setup_py = os.path.join(root, "setup.py") pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") +<<<<<<< HEAD if not ( os.path.exists(setup_py) or os.path.exists(pyproject_toml) or os.path.exists(versioneer_py) ): +======= + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): +>>>>>>> 4e967422 (fixing tests) err = ( "Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " @@ -383,10 +391,17 @@ def get_root() -> str: my_path = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) +<<<<<<< HEAD if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals(): print( "Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(my_path), versioneer_py) +======= + if me_dir != vsr_dir: + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) +>>>>>>> 4e967422 (fixing tests) ) except NameError: pass @@ -423,6 +438,14 @@ def get_config_from_root(root: str) -> VersioneerConfig: # common VersioneerConfig users at the moment. 
We verify against # `None` values elsewhere where it matters +<<<<<<< HEAD +======= + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + +>>>>>>> 4e967422 (fixing tests) cfg = VersioneerConfig() cfg.VCS = section["VCS"] cfg.style = section.get("style", "") @@ -450,10 +473,17 @@ class NotThisMethod(Exception): HANDLERS: Dict[str, Dict[str, Callable]] = {} +<<<<<<< HEAD def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: +======= +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + + def decorate(f): +>>>>>>> 4e967422 (fixing tests) """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f @@ -461,6 +491,7 @@ def decorate(f: Callable) -> Callable: return decorate +<<<<<<< HEAD def run_command( commands: List[str], args: List[str], @@ -469,6 +500,9 @@ def run_command( hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: +======= +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): +>>>>>>> 4e967422 (fixing tests) """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -484,13 +518,21 @@ def run_command( try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git +<<<<<<< HEAD process = subprocess.Popen( [command] + args, +======= + p = subprocess.Popen( + [c] + args, +>>>>>>> 4e967422 (fixing tests) cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), +<<<<<<< HEAD **popen_kwargs, +======= +>>>>>>> 4e967422 (fixing tests) ) break except OSError as e: @@ -515,7 +557,11 @@ def run_command( LONG_VERSION_PY[ "git" +<<<<<<< HEAD ] = r''' +======= +] = ''' +>>>>>>> 4e967422 (fixing 
tests) # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -1260,7 +1306,11 @@ def git_versions_from_keywords( # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " +<<<<<<< HEAD tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} +======= + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) +>>>>>>> 4e967422 (fixing tests) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1269,7 +1319,11 @@ def git_versions_from_keywords( # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". +<<<<<<< HEAD tags = {r for r in refs if re.search(r"\d", r)} +======= + tags = set([r for r in refs if re.search(r"\d", r)]) +>>>>>>> 4e967422 (fixing tests) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1278,11 +1332,14 @@ def git_versions_from_keywords( # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] +<<<<<<< HEAD # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r"\d", r): continue +======= +>>>>>>> 4e967422 (fixing tests) if verbose: print("picking %s" % r) return { @@ -1318,6 +1375,7 @@ def git_pieces_from_vcs( if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] +<<<<<<< HEAD # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. 
@@ -1326,6 +1384,9 @@ def git_pieces_from_vcs( runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) +======= + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) +>>>>>>> 4e967422 (fixing tests) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1333,7 +1394,11 @@ def git_pieces_from_vcs( # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) +<<<<<<< HEAD describe_out, rc = runner( +======= + describe_out, rc = run_command( +>>>>>>> 4e967422 (fixing tests) GITS, [ "describe", @@ -1342,7 +1407,11 @@ def git_pieces_from_vcs( "--always", "--long", "--match", +<<<<<<< HEAD f"{tag_prefix}[[:digit:]]*", +======= + "%s*" % tag_prefix, +>>>>>>> 4e967422 (fixing tests) ], cwd=root, ) @@ -1408,7 +1477,11 @@ def git_pieces_from_vcs( # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: +<<<<<<< HEAD # unparsable. Maybe git-describe is misbehaving? +======= + # unparseable. Maybe git-describe is misbehaving? +>>>>>>> 4e967422 (fixing tests) pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces @@ -1434,6 +1507,7 @@ def git_pieces_from_vcs( else: # HEX: no tags pieces["closest-tag"] = None +<<<<<<< HEAD out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits @@ -1442,6 +1516,15 @@ def git_pieces_from_vcs( # Use only the last line. Previous lines may contain GPG signature # information. 
date = date.splitlines()[-1] +======= + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() +>>>>>>> 4e967422 (fixing tests) pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1508,8 +1591,14 @@ def versions_from_parentdir( "error": None, "date": None, } +<<<<<<< HEAD rootdirs.append(root) root = os.path.dirname(root) # up a level +======= + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level +>>>>>>> 4e967422 (fixing tests) if verbose: print( @@ -1558,6 +1647,10 @@ def versions_from_file(filename: str) -> Dict[str, Any]: def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None: """Write the given version number to the given _version.py file.""" +<<<<<<< HEAD +======= + os.unlink(filename) +>>>>>>> 4e967422 (fixing tests) contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1838,7 +1931,11 @@ def get_versions(verbose: bool = False) -> Dict[str, Any]: assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS +<<<<<<< HEAD verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` +======= + verbose = verbose or cfg.verbose +>>>>>>> 4e967422 (fixing tests) assert ( cfg.versionfile_source is not None ), "please set versioneer.versionfile_source" @@ -2230,8 +2327,17 @@ def do_setup() -> int: root = get_root() try: cfg = get_config_from_root(root) +<<<<<<< HEAD except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): +======= + except ( + EnvironmentError, + 
configparser.NoSectionError, + configparser.NoOptionError, + ) as err: + if isinstance(err, (EnvironmentError, configparser.NoSectionError)): +>>>>>>> 4e967422 (fixing tests) print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) @@ -2253,7 +2359,10 @@ def do_setup() -> int: ) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") +<<<<<<< HEAD maybe_ipy: Optional[str] = ipy +======= +>>>>>>> 4e967422 (fixing tests) if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -2274,7 +2383,45 @@ def do_setup() -> int: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) +<<<<<<< HEAD maybe_ipy = None +======= + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. 
+ if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") +>>>>>>> 4e967422 (fixing tests) # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword From c38eccec39018c37834c8aa8bb81bf22678fbdc9 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Wed, 15 May 2024 07:47:19 -0500 Subject: [PATCH 03/19] removing additional parameter from try_cast_to_pandas Signed-off-by: arunjose696 --- .../algebra/default2pandas/binary.py | 1 - .../pandas/small_query_compiler.py | 21 +- modin/utils.py | 22 +- versioneer.py | 261 +++++++++++------- 4 files changed, 177 insertions(+), 128 deletions(-) diff --git a/modin/core/dataframe/algebra/default2pandas/binary.py b/modin/core/dataframe/algebra/default2pandas/binary.py index b834e948c8c..a6e89c4d760 100644 --- a/modin/core/dataframe/algebra/default2pandas/binary.py +++ b/modin/core/dataframe/algebra/default2pandas/binary.py @@ -47,7 +47,6 @@ def bin_ops_wrapper(df, other, *args, **kwargs): "squeeze_other", False ) squeeze_self = kwargs.pop("squeeze_self", False) - if squeeze_other: other = other.squeeze(axis=1) diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index 7d5c6be6f00..bc60108b060 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -357,7 +357,15 @@ def binary_operator(df, other, 
**kwargs): if squeeze_self: df = df.squeeze(axis=1) - return getattr(df, op)(other, **kwargs) + result = getattr(df, op)(other, **kwargs) + if ( + not isinstance(result, pandas.Series) + and not isinstance(result, pandas.DataFrame) + and is_list_like(result) + ): + result = pandas.DataFrame(result) + + return result return binary_operator @@ -569,12 +577,6 @@ def caller(query_compiler, *args, **kwargs): df = df.copy() if is_series: df = df.squeeze(axis=1) - elif ( - squeeze_series - and len(df.columns) == 1 - and df.columns[0] == MODIN_UNNAMED_SERIES_LABEL - ): - df = df.squeeze(axis=1) exclude_names = [ # "broadcast", "fold_axis", @@ -585,8 +587,8 @@ def caller(query_compiler, *args, **kwargs): kwargs = kwargs.copy() for name in exclude_names: kwargs.pop(name, None) - args = try_cast_to_pandas(args, squeeze=squeeze_args, squeeze_df=True) - kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs, squeeze_df=True) + args = try_cast_to_pandas(args, squeeze=squeeze_args) + kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs) result = func(df, *args, **kwargs) if in_place: result = df @@ -596,6 +598,7 @@ def caller(query_compiler, *args, **kwargs): if result.name is None: result.name = MODIN_UNNAMED_SERIES_LABEL result = result.to_frame() + return query_compiler.__constructor__(result) return caller diff --git a/modin/utils.py b/modin/utils.py index 32c7b7a9a5a..34071be132b 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -651,9 +651,7 @@ def hashable(obj: bool) -> bool: return True -def try_cast_to_pandas( - obj: Any, squeeze: bool = False, squeeze_df: bool = False -) -> Any: +def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any: """ Convert `obj` and all nested objects from Modin to pandas if it is possible. @@ -665,8 +663,6 @@ def try_cast_to_pandas( Object to convert from Modin to pandas. squeeze : bool, default: False Squeeze the converted object(s) before returning them. 
- squeeze_df : bool, default: False - Squeeze the converted DataFrame(s) if Series-like. Returns ------- @@ -677,13 +673,6 @@ def try_cast_to_pandas( result = obj.modin.to_pandas() if hasattr(obj, "modin") else obj.to_pandas() if squeeze: result = result.squeeze(axis=1) - if ( - squeeze_df - and isinstance(result, pandas.DataFrame) - and len(result.columns) == 1 - and result.columns[0] == MODIN_UNNAMED_SERIES_LABEL - ): - result = result.squeeze(axis=1) # QueryCompiler/low-level ModinFrame case, it doesn't have logic about convertion to Series if ( @@ -693,14 +682,9 @@ def try_cast_to_pandas( result.name = None return result if isinstance(obj, (list, tuple)): - return type(obj)( - [try_cast_to_pandas(o, squeeze=squeeze, squeeze_df=squeeze_df) for o in obj] - ) + return type(obj)([try_cast_to_pandas(o, squeeze=squeeze) for o in obj]) if isinstance(obj, dict): - return { - k: try_cast_to_pandas(v, squeeze=squeeze, squeeze_df=squeeze_df) - for k, v in obj.items() - } + return {k: try_cast_to_pandas(v, squeeze=squeeze) for k, v in obj.items()} if callable(obj): module_hierarchy = getattr(obj, "__module__", "").split(".") fn_name = getattr(obj, "__name__", None) diff --git a/versioneer.py b/versioneer.py index dabe8ac1dff..71109f05c02 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,6 +1,10 @@ <<<<<<< HEAD +<<<<<<< HEAD # Version: 0.29 ======= +======= + +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) # Version: 0.18 >>>>>>> 4e967422 (fixing tests) @@ -372,6 +376,7 @@ def get_root() -> str: ): ======= if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): +<<<<<<< HEAD >>>>>>> 4e967422 (fixing tests) err = ( "Versioneer was unable to run the project root directory. " @@ -380,6 +385,13 @@ def get_root() -> str: "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND')." ) +======= + err = ("Versioneer was unable to run the project root directory. 
" + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -398,11 +410,16 @@ def get_root() -> str: % (os.path.dirname(my_path), versioneer_py) ======= if me_dir != vsr_dir: +<<<<<<< HEAD print( "Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(me), versioneer_py) >>>>>>> 4e967422 (fixing tests) ) +======= + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) except NameError: pass return root @@ -444,8 +461,11 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None +<<<<<<< HEAD >>>>>>> 4e967422 (fixing tests) +======= +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) cfg = VersioneerConfig() cfg.VCS = section["VCS"] cfg.style = section.get("style", "") @@ -481,16 +501,15 @@ def decorate(f: Callable) -> Callable: ======= def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): >>>>>>> 4e967422 (fixing tests) """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f - return decorate +<<<<<<< HEAD <<<<<<< HEAD def run_command( commands: List[str], @@ -503,6 +522,10 @@ def run_command( ======= def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): >>>>>>> 4e967422 (fixing tests) +======= +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): +>>>>>>> 352ca3b6 (removing additional parameter from 
try_cast_to_pandas) """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -518,6 +541,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git +<<<<<<< HEAD <<<<<<< HEAD process = subprocess.Popen( [command] + args, @@ -534,6 +558,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= ======= >>>>>>> 4e967422 (fixing tests) ) +======= + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) break except OSError as e: if e.errno == errno.ENOENT: @@ -555,6 +585,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= return stdout, process.returncode +<<<<<<< HEAD LONG_VERSION_PY[ "git" <<<<<<< HEAD @@ -562,6 +593,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= ======= ] = ''' >>>>>>> 4e967422 (fixing tests) +======= +LONG_VERSION_PY['git'] = ''' +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -1306,11 +1340,15 @@ def git_versions_from_keywords( # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. 
TAG = "tag: " +<<<<<<< HEAD <<<<<<< HEAD tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} ======= tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) >>>>>>> 4e967422 (fixing tests) +======= + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1319,11 +1357,15 @@ def git_versions_from_keywords( # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". +<<<<<<< HEAD <<<<<<< HEAD tags = {r for r in refs if re.search(r"\d", r)} ======= tags = set([r for r in refs if re.search(r"\d", r)]) >>>>>>> 4e967422 (fixing tests) +======= + tags = set([r for r in refs if re.search(r'\d', r)]) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1331,6 +1373,7 @@ def git_versions_from_keywords( for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): +<<<<<<< HEAD r = ref[len(tag_prefix) :] <<<<<<< HEAD # Filter out refs that exactly match prefix or that don't start @@ -1340,25 +1383,21 @@ def git_versions_from_keywords( continue ======= >>>>>>> 4e967422 (fixing tests) +======= + r = ref[len(tag_prefix):] +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if verbose: print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") @@ -1375,6 +1414,7 @@ def git_pieces_from_vcs( if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] +<<<<<<< HEAD <<<<<<< HEAD # GIT_DIR can interfere with correct operation of Versioneer. 
# It may be intended to be passed to the Versioneer-versioned project, @@ -1387,6 +1427,10 @@ def git_pieces_from_vcs( ======= out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) >>>>>>> 4e967422 (fixing tests) +======= + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1394,6 +1438,7 @@ def git_pieces_from_vcs( # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) +<<<<<<< HEAD <<<<<<< HEAD describe_out, rc = runner( ======= @@ -1415,6 +1460,12 @@ def git_pieces_from_vcs( ], cwd=root, ) +======= + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1469,20 +1520,25 @@ def git_pieces_from_vcs( dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] + git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: <<<<<<< HEAD # unparsable. Maybe git-describe is misbehaving? ======= # unparseable. Maybe git-describe is misbehaving? 
+<<<<<<< HEAD >>>>>>> 4e967422 (fixing tests) pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out +======= + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) return pieces # tag @@ -1491,12 +1547,10 @@ def git_pieces_from_vcs( if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] + pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1507,6 +1561,7 @@ def git_pieces_from_vcs( else: # HEX: no tags pieces["closest-tag"] = None +<<<<<<< HEAD <<<<<<< HEAD out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits @@ -1525,6 +1580,15 @@ def git_pieces_from_vcs( 0 ].strip() >>>>>>> 4e967422 (fixing tests) +======= + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1584,6 +1648,7 @@ def versions_from_parentdir( for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): +<<<<<<< HEAD return { "version": dirname[len(parentdir_prefix) :], "full-revisionid": None, @@ -1595,16 +1660,19 @@ def versions_from_parentdir( rootdirs.append(root) root = 
os.path.dirname(root) # up a level ======= +======= + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) else: rootdirs.append(root) root = os.path.dirname(root) # up a level >>>>>>> 4e967422 (fixing tests) if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1633,13 +1701,11 @@ def versions_from_file(filename: str) -> Dict[str, Any]: contents = f.read() except OSError: raise NotThisMethod("unable to read _version.py") - mo = re.search( - r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) if not mo: - mo = re.search( - r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1650,8 +1716,13 @@ def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None: <<<<<<< HEAD ======= os.unlink(filename) +<<<<<<< HEAD >>>>>>> 4e967422 (fixing tests) contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) +======= + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1683,7 +1754,8 @@ def render_pep440(pieces: Dict[str, Any]) -> str: rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + 
rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1873,13 +1945,11 @@ def render_git_describe_long(pieces: Dict[str, Any]) -> str: def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} if not style or style == "default": style = "pep440" # the default @@ -1903,13 +1973,9 @@ def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: else: raise ValueError("unknown style '%s'" % style) - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} class VersioneerBadRootError(Exception): @@ -1935,10 +2001,15 @@ def get_versions(verbose: bool = False) -> Dict[str, Any]: verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` ======= verbose = verbose or cfg.verbose +<<<<<<< HEAD >>>>>>> 4e967422 (fixing tests) assert ( cfg.versionfile_source is not None ), "please set versioneer.versionfile_source" +======= + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1992,13 +2063,9 @@ def get_versions(verbose: bool = False) -> Dict[str, Any]: if verbose: print("unable to compute version") - return { - "version": "0+unknown", - 
"full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} def get_version() -> str: @@ -2051,7 +2118,6 @@ def run(self) -> None: print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) - cmds["version"] = cmd_version # we override "build_py" in setuptools @@ -2091,10 +2157,10 @@ def run(self) -> None: # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) - cmds["build_py"] = cmd_build_py if "build_ext" in cmds: @@ -2154,21 +2220,17 @@ def run(self) -> None: os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if "py2exe" in sys.modules: # py2exe enabled? + if 'py2exe' in sys.modules: # py2exe enabled? 
try: from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore except ImportError: @@ -2187,17 +2249,13 @@ def run(self) -> None: os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) cmds["py2exe"] = cmd_py2exe # sdist farms its file list building out to egg_info @@ -2264,10 +2322,8 @@ def make_release_tree(self, base_dir: str, files: List[str]) -> None: # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file( - target_versionfile, self._versioneer_generated_versions - ) - + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist return cmds @@ -2327,6 +2383,7 @@ def do_setup() -> int: root = get_root() try: cfg = get_config_from_root(root) +<<<<<<< HEAD <<<<<<< HEAD except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): @@ -2339,6 +2396,13 @@ def do_setup() -> int: if isinstance(err, (EnvironmentError, configparser.NoSectionError)): >>>>>>> 4e967422 (fixing tests) print("Adding sample versioneer config to setup.cfg", file=sys.stderr) +======= + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as err: + if isinstance(err, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) with open(os.path.join(root, "setup.cfg"), "a") 
as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -2347,22 +2411,23 @@ def do_setup() -> int: print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) +<<<<<<< HEAD ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") <<<<<<< HEAD maybe_ipy: Optional[str] = ipy ======= >>>>>>> 4e967422 (fixing tests) +======= + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") +>>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -2413,10 +2478,8 @@ def do_setup() -> int: else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print( - " appending versionfile_source ('%s') to MANIFEST.in" - % cfg.versionfile_source - ) + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: From 5c0141ae92239a95ba87887b20aa0393f4682ffc Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Thu, 16 May 2024 04:25:16 -0500 Subject: [PATCH 04/19] test_iter passing --- .../pandas/small_query_compiler.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index bc60108b060..5243f2f5b79 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ 
b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -1255,6 +1255,17 @@ def has_multiindex(self, axis=0): assert axis == 1 return isinstance(self._pandas_frame.columns, pandas.MultiIndex) + def isin(self, values, ignore_indices=False, **kwargs): + if isinstance(values, type(self)) and ignore_indices: + # Pandas logic is that it ignores indexing if 'values' is a 1D object + values = values.to_pandas().squeeze(axis=1) + if self._shape_hint == "column": + return _register_default_pandas(pandas.Series.isin, is_series=True)(self, values, **kwargs) + else: + return _register_default_pandas(pandas.DataFrame.isin)( + self, values, **kwargs + ) + def to_pandas(self): return self._pandas_frame @@ -1297,14 +1308,6 @@ def getitem_column_array(self, key, numeric=False): return self.__constructor__(self._pandas_frame.iloc[:, key]) return self.__constructor__(self._pandas_frame.loc[:, key]) - def columnarize(self): - if len(self._pandas_frame.columns) != 1 or ( - len(self._pandas_frame.index) == 1 - and self._pandas_frame.index[0] == MODIN_UNNAMED_SERIES_LABEL - ): - return SmallQueryCompiler(self._pandas_frame.transpose()) - return self - def is_series_like(self): return ( len(self._pandas_frame.columns) == 1 or len(self._pandas_frame.index) == 1 From e468d7e8ff6e255ca3f1f1020cd0e1097b940d0a Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Thu, 16 May 2024 06:43:27 -0500 Subject: [PATCH 05/19] fixing isin unique and clip Signed-off-by: arunjose696 --- .../pandas/small_query_compiler.py | 39 +++++++------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index 5243f2f5b79..8112fbd11f4 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -752,7 +752,6 @@ def setitem_bool(self, row_loc, 
col_loc, item): astype = _register_default_pandas(pandas.DataFrame.astype) case_when = _register_default_pandas(pandas.Series.case_when) cat_codes = _register_default_pandas(lambda ser: ser.cat.codes, is_series=True) - clip = _register_default_pandas(pandas.DataFrame.clip) combine = _register_default_pandas(_combine, squeeze_series=True) combine_first = _register_default_pandas( lambda df, other: df.combine_first(other), squeeze_series=True @@ -932,7 +931,6 @@ def setitem_bool(self, row_loc, col_loc, item): is_monotonic_increasing = _register_default_pandas( _is_monotonic("is_monotonic_increasing"), is_series=True ) - isin = _register_default_pandas(pandas.DataFrame.isin) isna = _register_default_pandas(pandas.DataFrame.isna) join = _register_default_pandas(pandas.DataFrame.join) kurt = _register_default_pandas(pandas.DataFrame.kurt) @@ -1114,7 +1112,6 @@ def setitem_bool(self, row_loc, col_loc, item): ) transpose = _register_default_pandas(pandas.DataFrame.transpose) truediv = _register_default_pandas(_register_binary("truediv"), squeeze_series=True) - unique = _register_default_pandas(pandas.Series.unique, is_series=True) unstack = _register_default_pandas(pandas.DataFrame.unstack) var = _register_default_pandas(pandas.DataFrame.var) where = _register_default_pandas(pandas.DataFrame.where) @@ -1126,25 +1123,17 @@ def setitem_bool(self, row_loc, col_loc, item): T = property(transpose) - _add_prefix_df = _register_default_pandas(pandas.DataFrame.add_prefix) - _add_prefix_series = _register_default_pandas( - pandas.Series.add_prefix, is_series=True - ) - - def add_prefix(self, prefix, axis=1): - if axis: - return self._add_prefix_df(prefix=prefix) - return self._add_prefix_series(prefix=prefix) - - _add_suffix_df = _register_default_pandas(pandas.DataFrame.add_suffix) - _add_suffix_series = _register_default_pandas( - pandas.Series.add_suffix, is_series=True - ) + add_prefix = _register_default_pandas(pandas.DataFrame.add_prefix) + add_suffix = 
_register_default_pandas(pandas.DataFrame.add_suffix) - def add_suffix(self, suffix, axis=1): - if axis: - return self._add_suffix_df(suffix=suffix) - return self._add_suffix_series(suffix=suffix) + def clip(self, lower, upper, **kwargs): + if isinstance(lower, BaseQueryCompiler): + lower = lower.to_pandas().squeeze(1) + if isinstance(upper, BaseQueryCompiler): + upper = upper.to_pandas().squeeze(1) + return _register_default_pandas(pandas.DataFrame.clip)( + self, lower, upper, **kwargs + ) def dot(self, other, squeeze_self=None, squeeze_other=None): other = try_cast_to_pandas(other) @@ -1260,12 +1249,14 @@ def isin(self, values, ignore_indices=False, **kwargs): # Pandas logic is that it ignores indexing if 'values' is a 1D object values = values.to_pandas().squeeze(axis=1) if self._shape_hint == "column": - return _register_default_pandas(pandas.Series.isin, is_series=True)(self, values, **kwargs) + return _register_default_pandas(pandas.Series.isin, is_series=True)( + self, values, **kwargs + ) else: return _register_default_pandas(pandas.DataFrame.isin)( self, values, **kwargs ) - + def to_pandas(self): return self._pandas_frame @@ -1303,7 +1294,7 @@ def from_dataframe(cls, df, data_cls): def dtypes(self): return self._pandas_frame.dtypes - def getitem_column_array(self, key, numeric=False): + def getitem_column_array(self, key, numeric=False, ignore_order=False): if numeric: return self.__constructor__(self._pandas_frame.iloc[:, key]) return self.__constructor__(self._pandas_frame.loc[:, key]) From b2a1ca1257f91d8ccae544345358313df6d49642 Mon Sep 17 00:00:00 2001 From: "Igoshev, Iaroslav" Date: Thu, 16 May 2024 15:06:32 +0000 Subject: [PATCH 06/19] Enable test_default.py and test_join_sort.py Signed-off-by: Igoshev, Iaroslav --- .github/workflows/ci.yml | 2 +- modin/config/__init__.py | 6 +- modin/config/envvars.py | 6 +- .../pandas/small_query_compiler.py | 142 ++++-------------- modin/pandas/base.py | 4 +- modin/pandas/dataframe.py | 8 +- modin/pandas/io.py | 
9 +- modin/pandas/series.py | 8 +- modin/tests/pandas/dataframe/test_binary.py | 6 +- modin/tests/pandas/dataframe/test_default.py | 36 ++--- modin/tests/pandas/dataframe/test_indexing.py | 10 +- .../tests/pandas/dataframe/test_join_sort.py | 15 +- modin/tests/pandas/test_expanding.py | 6 +- setup.cfg | 2 +- 14 files changed, 94 insertions(+), 166 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5e0b02b460..cce78ec2a64 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -646,7 +646,7 @@ jobs: matrix: python-version: ["3.9"] env: - MODIN_SMALL_QUERY_COMPILER: "True" + MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER: "True" name: test-small-query-compiler python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 diff --git a/modin/config/__init__.py b/modin/config/__init__.py index d81655b9dbd..a204b373e01 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -27,7 +27,6 @@ EnvironmentVariable, GithubCI, GpuCount, - InitializeWithSmallQueryCompilers, IsDebug, IsExperimental, IsRayCluster, @@ -54,7 +53,8 @@ TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, - + UsePlainPandasQueryCompiler, + ) from modin.config.pubsub import Parameter, ValueSource, context @@ -70,7 +70,7 @@ "CpuCount", "GpuCount", "Memory", - "InitializeWithSmallQueryCompilers", + "UsePlainPandasQueryCompiler", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 43bf5925471..3010dc28bdc 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -913,10 +913,10 @@ def _check_vars() -> None: ) -class InitializeWithSmallQueryCompilers(EnvironmentVariable, type=str): - """Set to true to use implementation of SmallQueryCompiler.""" +class UsePlainPandasQueryCompiler(EnvironmentVariable, type=bool): + """Set to true to use implementation of PlainPandasQueryCompiler.""" - varname = "MODIN_SMALL_QUERY_COMPILER" + varname = 
"MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER" default = False diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index 8112fbd11f4..d6767478926 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -12,21 +12,18 @@ # governing permissions and limitations under the License. """ -Module contains ``SmallQueryCompiler`` class. +Module contains ``PlainPandasQueryCompiler`` class. -``SmallQueryCompiler`` is responsible for compiling efficient DataFrame algebra +``PlainPandasQueryCompiler`` is responsible for compiling efficient DataFrame algebra queries for small data and empty ``PandasDataFrame``. """ -import warnings - import numpy as np import pandas from pandas.core.dtypes.common import is_list_like, is_scalar -from modin.config.envvars import InitializeWithSmallQueryCompilers +from modin.config.envvars import UsePlainPandasQueryCompiler from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler -from modin.error_message import ErrorMessage from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, @@ -48,9 +45,9 @@ def _get_axis(axis): callable(PandasQueryCompiler) -> pandas.Index """ if axis == 0: - return lambda self: self._pandas_frame.index + return lambda self: self._modin_frame.index else: - return lambda self: self._pandas_frame.columns + return lambda self: self._modin_frame.columns def _set_axis(axis): @@ -69,12 +66,12 @@ def _set_axis(axis): if axis == 0: def set_axis(self, idx): - self._pandas_frame.index = idx + self._modin_frame.index = idx else: def set_axis(self, cols): - self._pandas_frame.columns = cols + self._modin_frame.columns = cols return set_axis @@ -572,7 +569,7 @@ def _register_default_pandas( """ def caller(query_compiler, *args, **kwargs): - df = query_compiler._pandas_frame + df = 
query_compiler._modin_frame if df_copy: df = df.copy() if is_series: @@ -605,21 +602,22 @@ def caller(query_compiler, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class SmallQueryCompiler(BaseQueryCompiler): +class PlainPandasQueryCompiler(BaseQueryCompiler): """ Query compiler for the pandas storage format. - This class translates common query compiler API to default all methods - to pandas. + This class translates common query compiler API into + plain pandas to execute operations on small data + depending on the threshold. Parameters ---------- pandas_frame : pandas.DataFrame - Modin Frame to query with the compiled queries. + Pandas frame to query with the compiled queries. """ def __init__(self, pandas_frame): - assert InitializeWithSmallQueryCompilers.get() + assert UsePlainPandasQueryCompiler.get() if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): @@ -627,99 +625,23 @@ def __init__(self, pandas_frame): elif not isinstance(pandas_frame, pandas.DataFrame): pandas_frame = pandas.DataFrame(pandas_frame) - self._pandas_frame = pandas_frame - - # def default_to_pandas(self, pandas_op, *args, **kwargs): - # args = (a.to_pandas() if isinstance(a, type(self)) else a for a in args) - # kwargs = { - # k: v.to_pandas if isinstance(v, type(self)) else v - # for k, v in kwargs.items() - # } - # op_name = getattr(pandas_op, "__name__", str(pandas_op)) - # ErrorMessage.default_to_pandas(op_name) - - # result = pandas_op(self._pandas_frame, *args, **kwargs) - # if isinstance(result, pandas.Series): - # if result.name is None: - # result.name = MODIN_UNNAMED_SERIES_LABEL - # result = result.to_frame() - - # return result - - def default_to_pandas(self, pandas_op, *args, **kwargs): - """ - Do fallback to pandas for the passed function. - - Parameters - ---------- - pandas_op : callable(pandas.DataFrame) -> object - Function to apply to the casted to pandas frame. 
- *args : iterable - Positional arguments to pass to `pandas_op`. - **kwargs : dict - Key-value arguments to pass to `pandas_op`. - - Returns - ------- - BaseQueryCompiler - The result of the `pandas_op`, converted back to ``BaseQueryCompiler``. - """ - op_name = getattr(pandas_op, "__name__", str(pandas_op)) - ErrorMessage.default_to_pandas(op_name) - args = try_cast_to_pandas(args) - kwargs = try_cast_to_pandas(kwargs) - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - result = pandas_op(try_cast_to_pandas(self), *args, **kwargs) - if isinstance(result, (tuple, list)): - if "Series.tolist" in pandas_op.__name__: - # fast path: no need to iterate over the result from `tolist` function - return result - return [self.__wrap_in_qc(obj) for obj in result] - # breakpoint() - return type(self)(result) - - def __wrap_in_qc(self, obj): - """ - Wrap `obj` in query compiler. - - Parameters - ---------- - obj : any - Object to wrap. - - Returns - ------- - BaseQueryCompiler - Query compiler wrapping the object. 
- """ - if isinstance(obj, pandas.Series): - if obj.name is None: - obj.name = MODIN_UNNAMED_SERIES_LABEL - obj = obj.to_frame() - if isinstance(obj, pandas.DataFrame): - return self.from_pandas(obj, type(self._pandas_frame)) - else: - return obj + self._modin_frame = pandas_frame def execute(self): - """Wait for all computations to complete without materializing data.""" pass def take_2d_positional(self, index=None, columns=None): index = slice(None) if index is None else index columns = slice(None) if columns is None else columns - self._pandas_frame.iloc[index, columns] - return self.__constructor__(self._pandas_frame.iloc[index, columns]) + return self.__constructor__(self._modin_frame.iloc[index, columns]) def copy(self): - return self.__constructor__(self._pandas_frame.copy()) + return self.__constructor__(self._modin_frame.copy()) def setitem_bool(self, row_loc, col_loc, item): - self._pandas_frame.loc[row_loc._pandas_frame.squeeze(axis=1), col_loc] = item - return self.__constructor__(self._pandas_frame) + self._modin_frame.loc[row_loc._modin_frame.squeeze(axis=1), col_loc] = item + return self.__constructor__(self._modin_frame) __and__ = _register_default_pandas(pandas.DataFrame.__and__, squeeze_series=True) __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) @@ -943,7 +865,7 @@ def setitem_bool(self, row_loc, col_loc, item): mask = _register_default_pandas(pandas.DataFrame.mask) max = _register_default_pandas(pandas.DataFrame.max) map = _register_default_pandas(pandas.DataFrame.map) - mean = _register_default_pandas(pandas.DataFrame.mean) + mean = _register_default_pandas(pandas.DataFrame.mean, return_modin=False) median = _register_default_pandas(pandas.DataFrame.median) melt = _register_default_pandas(pandas.DataFrame.melt) memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) @@ -1140,9 +1062,9 @@ def dot(self, other, squeeze_self=None, squeeze_other=None): if squeeze_other: other = other.squeeze() if squeeze_self: - result = 
self._pandas_frame.squeeze(axis=1).dot(other) + result = self._modin_frame.squeeze(axis=1).dot(other) else: - result = self._pandas_frame.dot(other) + result = self._modin_frame.dot(other) if isinstance(result, pandas.Series): if result.name is None: result.name = "__reduced__" @@ -1227,7 +1149,7 @@ def expanding_corr( ) def get_axis(self, axis): - return self._pandas_frame.index if axis == 0 else self._pandas_frame.columns + return self._modin_frame.index if axis == 0 else self._modin_frame.columns def get_index_name(self, axis=0): return self.get_axis(axis).name @@ -1240,9 +1162,9 @@ def set_index_name(self, name, axis=0): def has_multiindex(self, axis=0): if axis == 0: - return isinstance(self._pandas_frame.index, pandas.MultiIndex) + return isinstance(self._modin_frame.index, pandas.MultiIndex) assert axis == 1 - return isinstance(self._pandas_frame.columns, pandas.MultiIndex) + return isinstance(self._modin_frame.columns, pandas.MultiIndex) def isin(self, values, ignore_indices=False, **kwargs): if isinstance(values, type(self)) and ignore_indices: @@ -1258,7 +1180,7 @@ def isin(self, values, ignore_indices=False, **kwargs): ) def to_pandas(self): - return self._pandas_frame + return self._modin_frame @classmethod def from_pandas(cls, df, data_cls): @@ -1277,7 +1199,7 @@ def finalize(self): # Dataframe exchange protocol def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): - return self._pandas_frame.__dataframe__( + return self._modin_frame.__dataframe__( nan_as_null=nan_as_null, allow_copy=allow_copy ) @@ -1292,14 +1214,12 @@ def from_dataframe(cls, df, data_cls): @property def dtypes(self): - return self._pandas_frame.dtypes + return self._modin_frame.dtypes def getitem_column_array(self, key, numeric=False, ignore_order=False): if numeric: - return self.__constructor__(self._pandas_frame.iloc[:, key]) - return self.__constructor__(self._pandas_frame.loc[:, key]) + return self.__constructor__(self._modin_frame.iloc[:, key]) + return 
self.__constructor__(self._modin_frame.loc[:, key]) def is_series_like(self): - return ( - len(self._pandas_frame.columns) == 1 or len(self._pandas_frame.index) == 1 - ) + return len(self._modin_frame.columns) == 1 or len(self._modin_frame.index) == 1 diff --git a/modin/pandas/base.py b/modin/pandas/base.py index bc9e89ba2bb..1be5c83390d 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -67,7 +67,7 @@ from modin import pandas as pd from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import ClassLogger, disable_logging from modin.pandas.accessor import CachedAccessor, ModinAPI @@ -286,7 +286,7 @@ def _build_repr_df( indexer = row_indexer, _get_repr_axis_label_indexer(self.columns, num_cols) else: indexer = row_indexer - if isinstance(self._query_compiler, SmallQueryCompiler): + if isinstance(self._query_compiler, PlainPandasQueryCompiler): return self._query_compiler.to_pandas().iloc[indexer] return self.iloc[indexer]._query_compiler.to_pandas() diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 818ea4238d9..0acc0b1ccdc 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -57,10 +57,10 @@ from pandas.io.formats.info import DataFrameInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import InitializeWithSmallQueryCompilers, PersistentPickle +from modin.config import PersistentPickle, UsePlainPandasQueryCompiler from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import disable_logging from modin.pandas import Categorical @@ -261,11 +261,11 @@ def __init__( else: self._query_compiler = query_compiler - if query_compiler is None and InitializeWithSmallQueryCompilers.get(): + if query_compiler is 
None and UsePlainPandasQueryCompiler.get(): small_dataframe = pandas.DataFrame( data=data, index=index, columns=columns, dtype=dtype, copy=copy ) - self._query_compiler = SmallQueryCompiler(small_dataframe) + self._query_compiler = PlainPandasQueryCompiler(small_dataframe) def __repr__(self) -> str: """ diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 13bea82a210..615bab25762 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -64,10 +64,11 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.readers import _c_parser_defaults -from modin.config import ModinNumpy, InitializeWithSmallQueryCompilers + +from modin.config import ModinNumpy, UsePlainPandasQueryCompiler from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import ClassLogger, enable_logging from modin.utils import ( @@ -994,8 +995,8 @@ def from_pandas(df) -> DataFrame: """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher - if InitializeWithSmallQueryCompilers.get(): - return ModinObjects.DataFrame(query_compiler=SmallQueryCompiler(df)) + if UsePlainPandasQueryCompiler.get(): + return ModinObjects.DataFrame(query_compiler=PlainPandasQueryCompiler(df)) return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index ede267bfedb..749cf0f6a50 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -31,9 +31,9 @@ from pandas.io.formats.info import SeriesInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import InitializeWithSmallQueryCompilers, PersistentPickle +from modin.config import PersistentPickle, UsePlainPandasQueryCompiler from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from 
modin.logging import disable_logging from modin.pandas.io import from_pandas, to_pandas @@ -147,8 +147,8 @@ def __init__( name = MODIN_UNNAMED_SERIES_LABEL if isinstance(data, pandas.Series) and data.name is not None: name = data.name - if InitializeWithSmallQueryCompilers.get(): - query_compiler = SmallQueryCompiler( + if UsePlainPandasQueryCompiler.get(): + query_compiler = PlainPandasQueryCompiler( pandas.DataFrame( pandas.Series( data=data, diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 70cb6b769c0..17351b21839 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat, InitializeWithSmallQueryCompilers +from modin.config import NPartitions, StorageFormat, UsePlainPandasQueryCompiler from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -211,8 +211,8 @@ def operation(df): reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 87bb1e15bf3..50e11b4068b 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -23,12 +23,7 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import ( - Engine, - InitializeWithSmallQueryCompilers, - NPartitions, - StorageFormat, -) +from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler from 
modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -95,7 +90,7 @@ def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): operation = getattr(modin_df, op) @@ -113,7 +108,7 @@ def test_style(): data = test_data_values[0] with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): pd.DataFrame(data).style @@ -125,7 +120,7 @@ def test_to_timestamp(): with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): df.to_period().to_timestamp() @@ -142,8 +137,8 @@ def test_to_numpy(data): @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): @@ -158,7 +153,7 @@ def test_asfreq(): df = pd.DataFrame({"s": series}) with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): # We are only testing that this defaults to pandas, so we will just check for @@ -320,7 +315,7 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - if not InitializeWithSmallQueryCompilers.get(): + if not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( 
modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) @@ -340,8 +335,8 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for non-partitioned executions", ) @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition @@ -632,7 +627,13 @@ def test_pivot(data, index, columns, values, request): in request.node.callspec.id or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id - or (current_execution in ("BaseOnPython",) and index is lib.no_default) + or ( + ( + current_execution in ("BaseOnPython",) + or UsePlainPandasQueryCompiler.get() + ) + and index is lib.no_default + ) ): pytest.xfail(reason="https://github.com/modin-project/modin/issues/7010") @@ -1010,7 +1011,8 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): "DateColumn", marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") - and StorageFormat.get() != "Base", + and StorageFormat.get() != "Base" + and not UsePlainPandasQueryCompiler.get(), reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 5b3b56d1a51..506ac2bb774 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,7 +21,7 @@ from pandas._testing import ensure_clean import modin.pandas as pd -from modin.config import MinRowPartitionSize, NPartitions, InitializeWithSmallQueryCompilers +from modin.config import MinRowPartitionSize, NPartitions, UsePlainPandasQueryCompiler from modin.pandas.indexing import is_range_like from modin.pandas.testing import 
assert_index_equal from modin.tests.pandas.utils import ( @@ -584,8 +584,8 @@ def test_loc_setting_single_categorical_column(): @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not currently support IO functions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not currently support IO functions.", ) def test_loc_multi_index(): modin_df = pd.read_csv( @@ -2241,8 +2241,8 @@ def test___setitem__partitions_aligning(): @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not currently support IO functions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not currently support IO functions.", ) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 670eb9ff911..8343468b1df 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
+import contextlib import warnings import matplotlib @@ -19,7 +20,7 @@ import pytest import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat +from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, @@ -610,7 +611,11 @@ def test_sort_multiindex(sort_remaining): setattr(df, index, new_index) for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not UsePlainPandasQueryCompiler.get() + else contextlib.nullcontext() + ): df_equals( modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), @@ -732,7 +737,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -772,7 +777,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", "Hello"]]) ) - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -892,7 +897,7 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( - Engine.get() not in ("Ray", "Unidist", "Dask"), + Engine.get() not in ("Ray", "Unidist", "Dask") or UsePlainPandasQueryCompiler.get(), reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index 
85e08595a89..5a962061e47 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -18,7 +18,7 @@ import pytest import modin.pandas as pd -from modin.config import InitializeWithSmallQueryCompilers, NPartitions +from modin.config import NPartitions, UsePlainPandasQueryCompiler from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -71,7 +71,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs): def test_dataframe_corr_cov(data, min_periods, axis, method): with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): eval_general( @@ -87,7 +87,7 @@ def test_dataframe_corr_cov_with_self(method): mdf, pdf = create_test_dfs(test_data["float_nan_data"]) with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): eval_general( diff --git a/setup.cfg b/setup.cfg index e6878fbaf72..1bf18172e1d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -70,4 +70,4 @@ exclude_lines = pass [pytest] -addopts = --env=MODIN_SMALL_QUERY_COMPILER=True \ No newline at end of file +addopts = --env=MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER=True \ No newline at end of file From b762d61c126bb42977bf2f4ca427bcc0754fca66 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Thu, 23 May 2024 06:07:56 -0500 Subject: [PATCH 07/19] fixed test_map_metadata by adding set_frame_dtypes_cache and has_materialized_dtypes to query compiler layer as in the code in multiple places the methods of private _modin_frame were used --- .github/workflows/ci.yml | 5 +++-- docs/conf.py | 4 +--- modin/config/__init__.py | 1 - .../storage_formats/pandas/query_compiler.py | 21 ++++++++++++++++++ modin/pandas/dataframe.py | 3 +-- modin/pandas/io.py | 1 - modin/pandas/utils.py | 8 +------ .../pandas/dataframe/test_map_metadata.py | 22 
++++++++++++++++--- setup.cfg | 3 --- 9 files changed, 46 insertions(+), 22 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cce78ec2a64..1108fd6ffa7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -214,6 +214,7 @@ jobs: unidist: ${{ steps.filter.outputs.unidist }} engines: ${{ steps.engines.outputs.engines }} experimental: ${{ steps.experimental.outputs.experimental }} + test-small-query-compiler: ${{ steps.filter.outputs.test-small-query-compiler }} steps: - uses: actions/checkout@v4 - uses: dorny/paths-filter@v3 @@ -636,8 +637,8 @@ jobs: - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py test-small-query-compiler: - needs: [changes, lint-flake8, lint-black, test-api, test-headers] - if: ${{ needs.changes.outputs.test-small-query-compiler == 'true' }} + needs: [ lint-flake8, execution-filter] + if: ${{ needs.execution-filter.outputs.test-small-query-compiler == 'true' }} runs-on: ubuntu-latest defaults: run: diff --git a/docs/conf.py b/docs/conf.py index 61f49793efb..783eb11e414 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,9 +48,7 @@ def noop_decorator(*args, **kwargs): if not hasattr(sys.modules["unidist"].core.base, "object_ref"): sys.modules["unidist"].core.base.object_ref = type("object_ref", (object,), {}) if not hasattr(sys.modules["unidist"].core.base.object_ref, "ObjectRef"): - sys.modules["unidist"].core.base.object_ref.ObjectRef = type( - "ObjectRef", (object,), {} - ) + sys.modules["unidist"].core.base.object_ref.ObjectRef = type("ObjectRef", (object,), {}) sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import modin diff --git a/modin/config/__init__.py b/modin/config/__init__.py index a204b373e01..b3c7f8f54f0 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -54,7 +54,6 @@ TestReadFromSqlServer, TrackFileLeaks, UsePlainPandasQueryCompiler, - ) from modin.config.pubsub import Parameter, ValueSource, 
context diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7c4f7e79f55..1b1d1e15f20 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -365,6 +365,27 @@ def copy(self): # END Copy + def has_materialized_dtypes(self): + """ + Check if the undelying modin frame has materialized dtypes + + Returns + ------- + bool + True if if the undelying modin frame and False otherwise. + """ + return self.has_materialized_dtypes() + + def set_frame_dtypes_cache(self, dtypes): + """ + Set dtypes cache for the underlying modin frame. + + Parameters + ---------- + dtypes : pandas.Series, ModinDtypes, callable or None + """ + self.set_frame_dtypes_cache(dtypes) + # Append/Concat/Join (Not Merge) # The append/concat/join operations should ideally never trigger remote # compute. These operations should only ever be manipulations of the diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 0acc0b1ccdc..516537c8108 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -155,8 +155,7 @@ def __init__( # use this list to update inplace when there is a shallow copy. 
self._siblings = [] if isinstance(data, (DataFrame, Series)): - query_compiler = data._query_compiler.copy() - self._query_compiler = query_compiler + self._query_compiler = data._query_compiler.copy() if index is not None and any(i not in data.index for i in index): raise NotImplementedError( "Passing non-existant columns or index values to constructor not" diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 615bab25762..5bb599dd749 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -64,7 +64,6 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.readers import _c_parser_defaults - from modin.config import ModinNumpy, UsePlainPandasQueryCompiler from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 09b19637e88..ef7f199b57c 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -22,13 +22,7 @@ from pandas._typing import AggFuncType, AggFuncTypeBase, AggFuncTypeDict, IndexLabel from pandas.util._decorators import doc - - -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, -) - -from modin.utils import hashable +from modin.utils import hashable _doc_binary_operation = """ Return {operation} of {left} and `{right}` (binary operator `{bin_op}`). 
diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index 4b19d5fbd9d..bde65a9b845 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -19,7 +19,12 @@ import pytest import modin.pandas as pd -from modin.config import MinRowPartitionSize, NPartitions, StorageFormat +from modin.config import ( + MinRowPartitionSize, + NPartitions, + StorageFormat, + UsePlainPandasQueryCompiler, +) from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.pandas.testing import assert_index_equal, assert_series_equal @@ -299,7 +304,10 @@ def test_copy(data): assert new_modin_df.columns is not modin_df.columns assert new_modin_df.dtypes is not modin_df.dtypes - if get_current_execution() != "BaseOnPython": + if ( + get_current_execution() != "BaseOnPython" + and not UsePlainPandasQueryCompiler.get() + ): assert np.array_equal( new_modin_df._query_compiler._modin_frame._partitions, modin_df._query_compiler._modin_frame._partitions, @@ -565,6 +573,10 @@ def test_astype_int64_to_astype_category_github_issue_6259(): get_current_execution() == "BaseOnPython", reason="BaseOnPython doesn't have proxy categories", ) +@pytest.mark.skipif( + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler doesn't have proxy categories", +) class TestCategoricalProxyDtype: """This class contains test and test usilities for the ``LazyProxyCategoricalDtype`` class.""" @@ -787,6 +799,10 @@ def comparator(df1, df2): ) +@pytest.mark.skipif( + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", +) def test_convert_dtypes_multiple_row_partitions(): # Column 0 should have string dtype modin_part1 = pd.DataFrame(["a"]).convert_dtypes() @@ -811,7 +827,7 @@ def test_convert_dtypes_5653(): modin_part1 = 
pd.DataFrame({"col1": ["a", "b", "c", "d"]}) modin_part2 = pd.DataFrame({"col1": [None, None, None, None]}) modin_df = pd.concat([modin_part1, modin_part2]) - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 diff --git a/setup.cfg b/setup.cfg index 1bf18172e1d..4296cb97859 100644 --- a/setup.cfg +++ b/setup.cfg @@ -68,6 +68,3 @@ exclude_lines = raise ImportError assert pass - -[pytest] -addopts = --env=MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER=True \ No newline at end of file From b47aee47f41190f8fc790f2f237a5b6ecb6e3543 Mon Sep 17 00:00:00 2001 From: "Igoshev, Iaroslav" Date: Thu, 23 May 2024 15:57:48 +0000 Subject: [PATCH 08/19] Fix test_dot Signed-off-by: Igoshev, Iaroslav --- modin/pandas/dataframe.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 516537c8108..62f85649429 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -260,12 +260,6 @@ def __init__( else: self._query_compiler = query_compiler - if query_compiler is None and UsePlainPandasQueryCompiler.get(): - small_dataframe = pandas.DataFrame( - data=data, index=index, columns=columns, dtype=dtype, copy=copy - ) - self._query_compiler = PlainPandasQueryCompiler(small_dataframe) - def __repr__(self) -> str: """ Return a string representation for a particular ``DataFrame``. 
From 820b399e083606aa3b1e8c735ef90bd542eefff9 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Thu, 23 May 2024 14:32:13 -0500 Subject: [PATCH 09/19] test_udf passing --- modin/core/storage_formats/base/query_compiler.py | 11 +++++++++++ modin/core/storage_formats/pandas/query_compiler.py | 10 ++++++++++ modin/pandas/dataframe.py | 5 +---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 80e89a577a2..cb4ed623b32 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4617,6 +4617,17 @@ def frame_has_dtypes_cache(self) -> bool: """ return self._modin_frame.has_dtypes_cache + def has_dtypes_cache(self) -> bool: + """ + Check if the dtypes cache exists for the underlying modin frame. + + Returns + ------- + bool + True for base class as dtypes are always present + """ + return True + def get_index_name(self, axis=0): """ Get index name of specified axis. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 1b1d1e15f20..691c2c7d698 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -386,6 +386,16 @@ def set_frame_dtypes_cache(self, dtypes): """ self.set_frame_dtypes_cache(dtypes) + def has_dtypes_cache(self) -> bool: + """ + Check if the dtypes cache exists for the underlying modin frame. + + Returns + ------- + bool + """ + return self._modin_frame.has_dtypes_cache + # Append/Concat/Join (Not Merge) # The append/concat/join operations should ideally never trigger remote # compute. 
These operations should only ever be manipulations of the diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 62f85649429..fe28d3680e0 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -57,11 +57,8 @@ from pandas.io.formats.info import DataFrameInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import PersistentPickle, UsePlainPandasQueryCompiler +from modin.config import PersistentPickle from modin.error_message import ErrorMessage -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - PlainPandasQueryCompiler, -) from modin.logging import disable_logging from modin.pandas import Categorical from modin.pandas.io import from_non_pandas, from_pandas, to_pandas From 6a999aa2137e1c973b4fc47bc8412f3c718cc9a3 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Thu, 23 May 2024 15:48:39 -0500 Subject: [PATCH 10/19] All tests except one passing in modin/tests/pandas/dataframe --- 629f1c86196011ef9edda4bf0168e181.db | Bin 0 -> 12288 bytes .../algebra/default2pandas/binary.py | 1 + .../storage_formats/pandas/query_compiler.py | 4 +- .../pandas/small_query_compiler.py | 41 ++++++++++++++++-- modin/pandas/series.py | 39 +++++------------ 5 files changed, 51 insertions(+), 34 deletions(-) create mode 100644 629f1c86196011ef9edda4bf0168e181.db diff --git a/629f1c86196011ef9edda4bf0168e181.db b/629f1c86196011ef9edda4bf0168e181.db new file mode 100644 index 0000000000000000000000000000000000000000..f2fd394d3ec40e5d5ec84376497ee34a440c718d GIT binary patch literal 12288 zcmeI#yGq1B6b9g#Op-}rbiA(@guzNh#B1*@$bzn@3)+QMBL-JRH(sz?Eqx20z$fqt zd;x8&1i?xWY%I(yQ@Dkzl_35Ha%O%qnMuAbD@*fn63MOo!`(t6$4DcgluSxV2x0n4 zhe{b~H}zZ%^wa#SDI-J2m-Xpgqv3*x;k2P3009U<00Izz00bZa0SG|gzY3HG44zpH zXgP@YHlx${G)$sW5*|i{&9HP(3{T=@JKWel+S@6`XHi&LRBHvBSq`T1fz0M+gEjf1 z?c`!k*4k3`^;VAU?JqA-o(Tea-jfv8i&3qIYTYyRJLadR=YwyK_FR3qoX*Z=bNNB( zxf}b%k?J^F9mlHUcy*l6(d$?L=3HD6@hYCgy|@)O;yR5e2tWV=5P$##AOHafKmY;| 
zfB*!3Rlw(*s&Z_*zO~!8b+RnKrOUT;GR?Z?PT$mtF|Vn^XF3^%+nDhT?z(ChN3?sC zJC2(3hBhz#7B23Hco%QtMLdf~@sLIo1Rwwb2tWV=5P$##AOHafKmY=NLBL{^+FW(Z SR^7BzXQt`~Gnt!q=C~gME1jeO literal 0 HcmV?d00001 diff --git a/modin/core/dataframe/algebra/default2pandas/binary.py b/modin/core/dataframe/algebra/default2pandas/binary.py index a6e89c4d760..b834e948c8c 100644 --- a/modin/core/dataframe/algebra/default2pandas/binary.py +++ b/modin/core/dataframe/algebra/default2pandas/binary.py @@ -47,6 +47,7 @@ def bin_ops_wrapper(df, other, *args, **kwargs): "squeeze_other", False ) squeeze_self = kwargs.pop("squeeze_self", False) + if squeeze_other: other = other.squeeze(axis=1) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 691c2c7d698..a073195a9a4 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -374,7 +374,7 @@ def has_materialized_dtypes(self): bool True if if the undelying modin frame and False otherwise. """ - return self.has_materialized_dtypes() + return self._modin_frame.has_materialized_dtypes def set_frame_dtypes_cache(self, dtypes): """ @@ -384,7 +384,7 @@ def set_frame_dtypes_cache(self, dtypes): ---------- dtypes : pandas.Series, ModinDtypes, callable or None """ - self.set_frame_dtypes_cache(dtypes) + self._modin_frame.set_dtypes_cache(dtypes) def has_dtypes_cache(self) -> bool: """ diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index d6767478926..3c96f8227ab 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -18,6 +18,8 @@ queries for small data and empty ``PandasDataFrame``. 
""" +from typing import Optional + import numpy as np import pandas from pandas.core.dtypes.common import is_list_like, is_scalar @@ -587,7 +589,11 @@ def caller(query_compiler, *args, **kwargs): args = try_cast_to_pandas(args, squeeze=squeeze_args) kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs) result = func(df, *args, **kwargs) + inplace_method = kwargs.get("inplace", False) + if in_place: + inplace_method = in_place + if inplace_method: result = df if not (return_modin or isinstance(result, (pandas.Series, pandas.DataFrame))): return result @@ -692,7 +698,6 @@ def setitem_bool(self, row_loc, col_loc, item): cumprod = _register_default_pandas(pandas.DataFrame.cumprod) cumsum = _register_default_pandas(pandas.DataFrame.cumsum) delitem = _register_default_pandas(_delitem) - describe = _register_default_pandas(pandas.DataFrame.describe) df_update = _register_default_pandas( pandas.DataFrame.update, in_place=True, df_copy=True ) @@ -855,7 +860,7 @@ def setitem_bool(self, row_loc, col_loc, item): ) isna = _register_default_pandas(pandas.DataFrame.isna) join = _register_default_pandas(pandas.DataFrame.join) - kurt = _register_default_pandas(pandas.DataFrame.kurt) + kurt = _register_default_pandas(pandas.DataFrame.kurt, return_modin=False) last_valid_index = _register_default_pandas( pandas.DataFrame.last_valid_index, return_modin=False ) @@ -866,7 +871,7 @@ def setitem_bool(self, row_loc, col_loc, item): max = _register_default_pandas(pandas.DataFrame.max) map = _register_default_pandas(pandas.DataFrame.map) mean = _register_default_pandas(pandas.DataFrame.mean, return_modin=False) - median = _register_default_pandas(pandas.DataFrame.median) + median = _register_default_pandas(pandas.DataFrame.median, return_modin=False) melt = _register_default_pandas(pandas.DataFrame.melt) memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) merge = _register_default_pandas(pandas.DataFrame.merge) @@ -964,7 +969,7 @@ def setitem_bool(self, row_loc, 
col_loc, item): series_view = _register_default_pandas(pandas.Series.view, is_series=True) set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) setitem = _register_default_pandas(_setitem) - skew = _register_default_pandas(pandas.DataFrame.skew) + skew = _register_default_pandas(pandas.DataFrame.skew, return_modin=False) sort_index = _register_default_pandas(_sort_index) sort_columns_by_row_values = _register_default_pandas( lambda df, columns, **kwargs: df.sort_values(by=columns, axis=1, **kwargs) @@ -1057,6 +1062,13 @@ def clip(self, lower, upper, **kwargs): self, lower, upper, **kwargs ) + def describe(self, percentiles: np.ndarray): + return _register_default_pandas(pandas.DataFrame.describe)( + self, + percentiles=percentiles, + include="all", + ) + def dot(self, other, squeeze_self=None, squeeze_other=None): other = try_cast_to_pandas(other) if squeeze_other: @@ -1223,3 +1235,24 @@ def getitem_column_array(self, key, numeric=False, ignore_order=False): def is_series_like(self): return len(self._modin_frame.columns) == 1 or len(self._modin_frame.index) == 1 + + def support_materialization_in_worker_process(self) -> bool: + """ + Whether it's possible to call function `to_pandas` during the pickling process, at the moment of recreating the object. + + Returns + ------- + bool + """ + return True + + def get_pandas_backend(self) -> Optional[str]: + """ + Get backend stored in `_modin_frame`. + + Returns + ------- + str | None + Backend name. 
+ """ + return None diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 749cf0f6a50..4df41168b5a 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -31,10 +31,7 @@ from pandas.io.formats.info import SeriesInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import PersistentPickle, UsePlainPandasQueryCompiler -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - PlainPandasQueryCompiler, -) +from modin.config import PersistentPickle from modin.logging import disable_logging from modin.pandas.io import from_pandas, to_pandas from modin.utils import ( @@ -147,32 +144,18 @@ def __init__( name = MODIN_UNNAMED_SERIES_LABEL if isinstance(data, pandas.Series) and data.name is not None: name = data.name - if UsePlainPandasQueryCompiler.get(): - query_compiler = PlainPandasQueryCompiler( - pandas.DataFrame( - pandas.Series( - data=data, - index=index, - dtype=dtype, - name=name, - copy=copy, - fastpath=fastpath, - ) + query_compiler = from_pandas( + pandas.DataFrame( + pandas.Series( + data=data, + index=index, + dtype=dtype, + name=name, + copy=copy, + fastpath=fastpath, ) ) - else: - query_compiler = from_pandas( - pandas.DataFrame( - pandas.Series( - data=data, - index=index, - dtype=dtype, - name=name, - copy=copy, - fastpath=fastpath, - ) - ) - )._query_compiler + )._query_compiler self._query_compiler = query_compiler.columnarize() if name is not None: self.name = name From 3cf940abe1145093dd91acf31de568d9f0abebf1 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Wed, 29 May 2024 04:32:09 -0500 Subject: [PATCH 11/19] All tests in modin/tests/pandas/dataframe/ passing --- .../core/storage_formats/pandas/small_query_compiler.py | 2 +- modin/pandas/io.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index 
3c96f8227ab..bfccb0fd105 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -1244,7 +1244,7 @@ def support_materialization_in_worker_process(self) -> bool: ------- bool """ - return True + return False def get_pandas_backend(self) -> Optional[str]: """ diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 5bb599dd749..ab309af54c0 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -995,7 +995,8 @@ def from_pandas(df) -> DataFrame: from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if UsePlainPandasQueryCompiler.get(): - return ModinObjects.DataFrame(query_compiler=PlainPandasQueryCompiler(df)) + df_copy = df.copy() + return ModinObjects.DataFrame(query_compiler=PlainPandasQueryCompiler(df_copy)) return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) From c85e7088c568fe13bfc9c7fe20577834b9d4ce95 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Wed, 29 May 2024 06:50:05 -0500 Subject: [PATCH 12/19] PR comments --- 629f1c86196011ef9edda4bf0168e181.db | Bin 12288 -> 0 bytes .../dispatching/factories/factories.py | 8 ++++++++ modin/pandas/base.py | 5 ----- modin/pandas/io.py | 9 +-------- 4 files changed, 9 insertions(+), 13 deletions(-) delete mode 100644 629f1c86196011ef9edda4bf0168e181.db diff --git a/629f1c86196011ef9edda4bf0168e181.db b/629f1c86196011ef9edda4bf0168e181.db deleted file mode 100644 index f2fd394d3ec40e5d5ec84376497ee34a440c718d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI#yGq1B6b9g#Op-}rbiA(@guzNh#B1*@$bzn@3)+QMBL-JRH(sz?Eqx20z$fqt zd;x8&1i?xWY%I(yQ@Dkzl_35Ha%O%qnMuAbD@*fn63MOo!`(t6$4DcgluSxV2x0n4 zhe{b~H}zZ%^wa#SDI-J2m-Xpgqv3*x;k2P3009U<00Izz00bZa0SG|gzY3HG44zpH zXgP@YHlx${G)$sW5*|i{&9HP(3{T=@JKWel+S@6`XHi&LRBHvBSq`T1fz0M+gEjf1 z?c`!k*4k3`^;VAU?JqA-o(Tea-jfv8i&3qIYTYyRJLadR=YwyK_FR3qoX*Z=bNNB( 
zxf}b%k?J^F9mlHUcy*l6(d$?L=3HD6@hYCgy|@)O;yR5e2tWV=5P$##AOHafKmY;| zfB*!3Rlw(*s&Z_*zO~!8b+RnKrOUT;GR?Z?PT$mtF|Vn^XF3^%+nDhT?z(ChN3?sC zJC2(3hBhz#7B23Hco%QtMLdf~@sLIo1Rwwb2tWV=5P$##AOHafKmY=NLBL{^+FW(Z SR^7BzXQt`~Gnt!q=C~gME1jeO diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index e7f2493e404..9b71a067159 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -26,7 +26,12 @@ import pandas from pandas.util._decorators import doc +from modin.config import UsePlainPandasQueryCompiler + from modin.core.io import BaseIO +from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( + PlainPandasQueryCompiler, +) from modin.utils import get_current_execution _doc_abstract_factory_class = """ @@ -168,6 +173,9 @@ def prepare(cls): method="io.from_pandas", ) def _from_pandas(cls, df): + if UsePlainPandasQueryCompiler.get(): + df_copy = df.copy() + return PlainPandasQueryCompiler(df_copy) return cls.io_cls.from_pandas(df) @classmethod diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 1be5c83390d..f7eebcd30f2 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -66,9 +66,6 @@ from modin import pandas as pd from modin.error_message import ErrorMessage -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - PlainPandasQueryCompiler, -) from modin.logging import ClassLogger, disable_logging from modin.pandas.accessor import CachedAccessor, ModinAPI from modin.pandas.utils import is_scalar @@ -286,8 +283,6 @@ def _build_repr_df( indexer = row_indexer, _get_repr_axis_label_indexer(self.columns, num_cols) else: indexer = row_indexer - if isinstance(self._query_compiler, PlainPandasQueryCompiler): - return self._query_compiler.to_pandas().iloc[indexer] return self.iloc[indexer]._query_compiler.to_pandas() def _update_inplace(self, new_query_compiler: 
BaseQueryCompiler) -> None: diff --git a/modin/pandas/io.py b/modin/pandas/io.py index ab309af54c0..508d1b2a4d5 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -64,11 +64,8 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.readers import _c_parser_defaults -from modin.config import ModinNumpy, UsePlainPandasQueryCompiler +from modin.config import ModinNumpy from modin.error_message import ErrorMessage -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - PlainPandasQueryCompiler, -) from modin.logging import ClassLogger, enable_logging from modin.utils import ( SupportsPrivateToNumPy, @@ -994,10 +991,6 @@ def from_pandas(df) -> DataFrame: """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher - if UsePlainPandasQueryCompiler.get(): - df_copy = df.copy() - return ModinObjects.DataFrame(query_compiler=PlainPandasQueryCompiler(df_copy)) - return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) From 88c4354970fefb868d39b37bd46f956fa93229af Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Wed, 5 Jun 2024 05:16:14 -0500 Subject: [PATCH 13/19] renaming to PlainPandasQueryCompiler to NativeDataframeMode --- .github/workflows/ci.yml | 17 +---- modin/config/__init__.py | 4 +- modin/config/envvars.py | 20 ++++-- .../dispatching/factories/factories.py | 11 ++-- ...y_compiler.py => native_query_compiler.py} | 63 +++++++++++++++++-- modin/tests/pandas/dataframe/test_binary.py | 6 +- modin/tests/pandas/dataframe/test_default.py | 27 ++++---- modin/tests/pandas/dataframe/test_indexing.py | 12 ++-- .../tests/pandas/dataframe/test_join_sort.py | 11 ++-- .../pandas/dataframe/test_map_metadata.py | 17 +++-- modin/tests/pandas/test_expanding.py | 6 +- 11 files changed, 121 insertions(+), 73 deletions(-) rename modin/experimental/core/storage_formats/pandas/{small_query_compiler.py => native_query_compiler.py} (97%) diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index 1108fd6ffa7..ecedb33b929 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -647,7 +647,7 @@ jobs: matrix: python-version: ["3.9"] env: - MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER: "True" + MODIN_NATIVE_DATAFRAME_MODE: "Native_pandas" name: test-small-query-compiler python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 @@ -667,21 +667,6 @@ jobs: - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py - run: python -m pytest modin/tests/pandas/dataframe/test_window.py - - run: python -m pytest modin/tests/pandas/extensions/test_dataframe_extensions.py - - run: python -m pytest modin/tests/pandas/extensions/test_pd_extensions.py - - run: python -m pytest modin/tests/pandas/extensions/test_series_extensions.py - - run: python -m pytest modin/tests/pandas/integrations/test_lazy_import.py - - run: python -m pytest modin/tests/pandas/internals/test_benchmark_mode.py - - run: python -m pytest modin/tests/pandas/internals/test_repartition.py - - run: python -m pytest modin/tests/pandas/test_api.py - - run: python -m pytest modin/tests/pandas/test_concat.py - - run: python -m pytest modin/tests/pandas/test_expanding.py - - run: python -m pytest modin/tests/pandas/test_general.py - - run: python -m pytest modin/tests/pandas/test_groupby.py - - run: python -m pytest modin/tests/pandas/test_io.py - - run: python -m pytest modin/tests/pandas/test_reshape.py - - run: python -m pytest modin/tests/pandas/test_rolling.py - - run: python -m pytest modin/tests/pandas/test_series.py - uses: codecov/codecov-action@v2 merge-coverage-artifacts: diff --git a/modin/config/__init__.py b/modin/config/__init__.py index b3c7f8f54f0..bcacded395f 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -39,6 +39,7 @@ MinPartitionSize, MinRowPartitionSize, ModinNumpy, + NativeDataframeMode, NPartitions, PersistentPickle, ProgressBar, @@ 
-53,7 +54,6 @@ TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, - UsePlainPandasQueryCompiler, ) from modin.config.pubsub import Parameter, ValueSource, context @@ -69,7 +69,7 @@ "CpuCount", "GpuCount", "Memory", - "UsePlainPandasQueryCompiler", + "NativeDataframeMode", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 3010dc28bdc..c2feb1841b4 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -913,11 +913,23 @@ def _check_vars() -> None: ) -class UsePlainPandasQueryCompiler(EnvironmentVariable, type=bool): - """Set to true to use implementation of PlainPandasQueryCompiler.""" +class NativeDataframeMode(EnvironmentVariable, type=str): + """ + The mode of execution used for handling dataframes in Modin - varname = "MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER" - default = False + When the env variable is set to None the PandasQueryCompiler would be used + which would lead to modin executing dataframes in distributed fashion. + When set to Native_pandas NativeQueryCompiler is used which handles the + dataframes without distributing, falling back to native pandas functions. + + In future more execution modes can be added for single node execution so + keeping the parameter as string. 
+ + """ + + varname = "MODIN_NATIVE_DATAFRAME_MODE" + choices = ("Native_pandas",) + default = None _check_vars() diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index 9b71a067159..ee1b68b2dee 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -26,11 +26,10 @@ import pandas from pandas.util._decorators import doc -from modin.config import UsePlainPandasQueryCompiler - +from modin.config import NativeDataframeMode from modin.core.io import BaseIO -from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - PlainPandasQueryCompiler, +from modin.experimental.core.storage_formats.pandas.native_query_compiler import ( + NativeQueryCompiler, ) from modin.utils import get_current_execution @@ -173,9 +172,9 @@ def prepare(cls): method="io.from_pandas", ) def _from_pandas(cls, df): - if UsePlainPandasQueryCompiler.get(): + if NativeDataframeMode.get(): df_copy = df.copy() - return PlainPandasQueryCompiler(df_copy) + return NativeQueryCompiler(df_copy) return cls.io_cls.from_pandas(df) @classmethod diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py similarity index 97% rename from modin/experimental/core/storage_formats/pandas/small_query_compiler.py rename to modin/experimental/core/storage_formats/pandas/native_query_compiler.py index bfccb0fd105..9f8cf19295b 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py @@ -12,9 +12,9 @@ # governing permissions and limitations under the License. """ -Module contains ``PlainPandasQueryCompiler`` class. +Module contains ``NativeQueryCompiler`` class. 
-``PlainPandasQueryCompiler`` is responsible for compiling efficient DataFrame algebra +``NativeQueryCompiler`` is responsible for compiling efficient DataFrame algebra queries for small data and empty ``PandasDataFrame``. """ @@ -24,7 +24,7 @@ import pandas from pandas.core.dtypes.common import is_list_like, is_scalar -from modin.config.envvars import UsePlainPandasQueryCompiler +from modin.config.envvars import NativeDataframeMode from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, @@ -608,7 +608,7 @@ def caller(query_compiler, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class PlainPandasQueryCompiler(BaseQueryCompiler): +class NativeQueryCompiler(BaseQueryCompiler): """ Query compiler for the pandas storage format. @@ -623,7 +623,7 @@ class PlainPandasQueryCompiler(BaseQueryCompiler): """ def __init__(self, pandas_frame): - assert UsePlainPandasQueryCompiler.get() + assert NativeDataframeMode.get() == "Native_Pandas" if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): @@ -636,6 +636,59 @@ def __init__(self, pandas_frame): def execute(self): pass + @property + def frame_has_materialized_dtypes(self) -> bool: + """ + Check if the undelying dataframe has materialized dtypes. + + Returns + ------- + bool + """ + return True + + def set_frame_dtypes_cache(self, dtypes): + """ + Set dtypes cache for the underlying dataframe frame. + + Parameters + ---------- + dtypes : pandas.Series, ModinDtypes, callable or None + """ + pass + + def set_frame_index_cache(self, index): + """ + Set index cache for underlying dataframe. + + Parameters + ---------- + index : sequence, callable or None + """ + pass + + @property + def frame_has_index_cache(self): + """ + Check if the index cache exists for underlying dataframe. 
+ + Returns + ------- + bool + """ + return True + + @property + def frame_has_dtypes_cache(self) -> bool: + """ + Check if the dtypes cache exists for the underlying dataframe. + + Returns + ------- + bool + """ + return True + def take_2d_positional(self, index=None, columns=None): index = slice(None) if index is None else index columns = slice(None) if columns is None else columns diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 17351b21839..1b643cfcdba 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,7 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, StorageFormat, UsePlainPandasQueryCompiler +from modin.config import NativeDataframeMode, NPartitions, StorageFormat from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -211,8 +211,8 @@ def operation(df): reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 50e11b4068b..016ee2e7ac9 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -23,7 +23,7 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler +from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -90,7 +90,7 @@ def 
test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): operation = getattr(modin_df, op) @@ -108,7 +108,7 @@ def test_style(): data = test_data_values[0] with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): pd.DataFrame(data).style @@ -120,7 +120,7 @@ def test_to_timestamp(): with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): df.to_period().to_timestamp() @@ -137,8 +137,8 @@ def test_to_numpy(data): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): @@ -153,7 +153,7 @@ def test_asfreq(): df = pd.DataFrame({"s": series}) with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): # We are only testing that this defaults to pandas, so we will just check for @@ -315,7 +315,7 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - if not UsePlainPandasQueryCompiler.get(): + if not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) @@ -335,8 +335,8 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for 
non-partitioned executions", ) @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition @@ -628,10 +628,7 @@ def test_pivot(data, index, columns, values, request): or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id or ( - ( - current_execution in ("BaseOnPython",) - or UsePlainPandasQueryCompiler.get() - ) + (current_execution in ("BaseOnPython",) or NativeDataframeMode.get()) and index is lib.no_default ) ): @@ -1012,7 +1009,7 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") and StorageFormat.get() != "Base" - and not UsePlainPandasQueryCompiler.get(), + and NativeDataframeMode.get() is None, reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 506ac2bb774..0f38eaa5ebe 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,7 +21,11 @@ from pandas._testing import ensure_clean import modin.pandas as pd +<<<<<<< HEAD from modin.config import MinRowPartitionSize, NPartitions, UsePlainPandasQueryCompiler +======= +from modin.config import MinPartitionSize, NativeDataframeMode, NPartitions +>>>>>>> 1984aa1f (renaming to PlainPandasQueryCompiler to NativeDataframeMode) from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( @@ -584,8 +588,8 @@ def test_loc_setting_single_categorical_column(): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - 
reason="PlainPandasQueryCompiler does not currently support IO functions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not currently support IO functions.", ) def test_loc_multi_index(): modin_df = pd.read_csv( @@ -2241,8 +2245,8 @@ def test___setitem__partitions_aligning(): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not currently support IO functions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not currently support IO functions.", ) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 8343468b1df..46983ffd45c 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -20,7 +20,7 @@ import pytest import modin.pandas as pd -from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler +from modin.config import Engine, NativeDataframeMode, NPartitions, StorageFormat from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, @@ -613,7 +613,7 @@ def test_sort_multiindex(sort_remaining): for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): df_equals( @@ -737,7 +737,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): + if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -777,7 +777,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", 
"Hello"]]) ) - if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): + if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -897,7 +897,8 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( - Engine.get() not in ("Ray", "Unidist", "Dask") or UsePlainPandasQueryCompiler.get(), + Engine.get() not in ("Ray", "Unidist", "Dask") + or NativeDataframeMode.get() is not None, reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index bde65a9b845..40c910ed4cc 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -21,9 +21,9 @@ import modin.pandas as pd from modin.config import ( MinRowPartitionSize, + NativeDataframeMode, NPartitions, StorageFormat, - UsePlainPandasQueryCompiler, ) from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas @@ -304,10 +304,7 @@ def test_copy(data): assert new_modin_df.columns is not modin_df.columns assert new_modin_df.dtypes is not modin_df.dtypes - if ( - get_current_execution() != "BaseOnPython" - and not UsePlainPandasQueryCompiler.get() - ): + if get_current_execution() != "BaseOnPython" and not NativeDataframeMode.get(): assert np.array_equal( new_modin_df._query_compiler._modin_frame._partitions, modin_df._query_compiler._modin_frame._partitions, @@ -574,8 +571,8 @@ def test_astype_int64_to_astype_category_github_issue_6259(): reason="BaseOnPython doesn't have proxy categories", ) @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler doesn't have 
proxy categories", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler doesn't have proxy categories", ) class TestCategoricalProxyDtype: """This class contains test and test usilities for the ``LazyProxyCategoricalDtype`` class.""" @@ -800,8 +797,8 @@ def comparator(df1, df2): @pytest.mark.skipif( - UsePlainPandasQueryCompiler.get(), - reason="PlainPandasQueryCompiler does not contain partitions.", + NativeDataframeMode.get() is not None, + reason="NativeQueryCompiler does not contain partitions.", ) def test_convert_dtypes_multiple_row_partitions(): # Column 0 should have string dtype @@ -827,7 +824,7 @@ def test_convert_dtypes_5653(): modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]}) modin_part2 = pd.DataFrame({"col1": [None, None, None, None]}) modin_df = pd.concat([modin_part1, modin_part2]) - if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): + if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index 5a962061e47..d96a38bc21e 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -18,7 +18,7 @@ import pytest import modin.pandas as pd -from modin.config import NPartitions, UsePlainPandasQueryCompiler +from modin.config import NativeDataframeMode, NPartitions from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -71,7 +71,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs): def test_dataframe_corr_cov(data, min_periods, axis, method): with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): eval_general( @@ -87,7 +87,7 @@ def test_dataframe_corr_cov_with_self(method): mdf, pdf = 
create_test_dfs(test_data["float_nan_data"]) with ( warns_that_defaulting_to_pandas() - if not UsePlainPandasQueryCompiler.get() + if not NativeDataframeMode.get() else contextlib.nullcontext() ): eval_general( From b09d0f7d2c66816bac5f91700153dd1104f2058b Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Wed, 5 Jun 2024 05:16:14 -0500 Subject: [PATCH 14/19] renaming to PlainPandasQueryCompiler to NativeDataframeMode --- .../pandas/native_query_compiler.py | 76 ++++++------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py index 9f8cf19295b..8679dc318dc 100644 --- a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py @@ -336,15 +336,6 @@ def _register_binary(op): """ def binary_operator(df, other, **kwargs): - - # if isinstance(other, pandas.DataFrame) and ( - # not df.empty - # or ( - # len(other.columns) == 1 - # and other.columns[0] == MODIN_UNNAMED_SERIES_LABEL - # ) - # ): - # other = other.squeeze() squeeze_other = kwargs.pop("broadcast", False) or kwargs.pop( "squeeze_other", False ) @@ -369,27 +360,8 @@ def binary_operator(df, other, **kwargs): return binary_operator -def _register_exanding(func): - def binary_operator(df, fold_axis, rolling_args, *args, **kwargs): - # if - # other_for_default = ( - # other - # if other is None - # else ( - # other.to_pandas().squeeze(axis=1) - # if squeeze_other - # else other.to_pandas() - # ) - # ) - - # if isinstance(other, pandas.DataFrame) and ( - # not df.empty - # or ( - # len(other.columns) == 1 - # and other.columns[0] == MODIN_UNNAMED_SERIES_LABEL - # ) - # ): - # other = other.squeeze() +def _register_expanding(func): + def expanding_operator(df, fold_axis, rolling_args, *args, **kwargs): squeeze_self = kwargs.pop("squeeze_self", False) if squeeze_self: 
@@ -400,7 +372,7 @@ def binary_operator(df, fold_axis, rolling_args, *args, **kwargs): return func(roller, *args, **kwargs) - return binary_operator + return expanding_operator def _register_resample(op): @@ -485,7 +457,7 @@ def _write_items( df, row_numeric_index, col_numeric_index, - broadcasted_items, + item, need_columns_reindex=True, ): # noqa: GL08 from modin.pandas.utils import broadcast_item, is_scalar @@ -494,16 +466,16 @@ def _write_items( row_numeric_index = list(row_numeric_index) if not isinstance(col_numeric_index, slice): col_numeric_index = list(col_numeric_index) - if not is_scalar(broadcasted_items): + if not is_scalar(item): broadcasted_items, _ = broadcast_item( df, row_numeric_index, col_numeric_index, - broadcasted_items, + item, need_columns_reindex=need_columns_reindex, ) else: - broadcasted_items = broadcasted_items + broadcasted_items = item if isinstance(df.iloc[row_numeric_index, col_numeric_index], pandas.Series): broadcasted_items = broadcasted_items.squeeze() @@ -817,46 +789,46 @@ def setitem_bool(self, row_loc, col_loc, item): eval = _register_default_pandas(pandas.DataFrame.eval) explode = _register_default_pandas(pandas.DataFrame.explode) expanding_count = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.count) + _register_expanding(pandas.core.window.expanding.Expanding.count) ) expanding_sum = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.sum) + _register_expanding(pandas.core.window.expanding.Expanding.sum) ) expanding_mean = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.mean) + _register_expanding(pandas.core.window.expanding.Expanding.mean) ) expanding_median = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.median) + _register_expanding(pandas.core.window.expanding.Expanding.median) ) expanding_std = _register_default_pandas( - 
_register_exanding(pandas.core.window.expanding.Expanding.std) + _register_expanding(pandas.core.window.expanding.Expanding.std) ) expanding_min = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.min) + _register_expanding(pandas.core.window.expanding.Expanding.min) ) expanding_max = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.max) + _register_expanding(pandas.core.window.expanding.Expanding.max) ) expanding_skew = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.skew) + _register_expanding(pandas.core.window.expanding.Expanding.skew) ) expanding_kurt = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.kurt) + _register_expanding(pandas.core.window.expanding.Expanding.kurt) ) expanding_sem = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.sem) + _register_expanding(pandas.core.window.expanding.Expanding.sem) ) expanding_quantile = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.quantile) + _register_expanding(pandas.core.window.expanding.Expanding.quantile) ) expanding_aggregate = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.aggregate) + _register_expanding(pandas.core.window.expanding.Expanding.aggregate) ) expanding_var = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.var) + _register_expanding(pandas.core.window.expanding.Expanding.var) ) expanding_rank = _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.rank) + _register_expanding(pandas.core.window.expanding.Expanding.rank) ) fillna = _register_default_pandas(_fillna) @@ -1162,10 +1134,10 @@ def expanding_cov( else other.to_pandas() ) ) - # expanding_rank = _register_default_pandas(_register_exanding(pandas.core.window.expanding.Expanding.rank)) + # expanding_rank = 
_register_default_pandas(_register_expanding(pandas.core.window.expanding.Expanding.rank)) return _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.cov) + _register_expanding(pandas.core.window.expanding.Expanding.cov) )( self, fold_axis, @@ -1200,7 +1172,7 @@ def expanding_corr( ) ) return _register_default_pandas( - _register_exanding(pandas.core.window.expanding.Expanding.corr) + _register_expanding(pandas.core.window.expanding.Expanding.corr) )( self, fold_axis, From e0590cb16a50ba965ac1f579bf752c54205752ab Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Mon, 10 Jun 2024 03:46:39 -0500 Subject: [PATCH 15/19] PR comments + changes --- .github/workflows/ci.yml | 2 +- modin/config/envvars.py | 5 +- .../storage_formats/base/query_compiler.py | 11 --- .../storage_formats/pandas/query_compiler.py | 31 --------- .../pandas/native_query_compiler.py | 68 +++++++++---------- modin/pandas/series.py | 1 + modin/tests/pandas/dataframe/test_default.py | 25 ++----- .../tests/pandas/dataframe/test_join_sort.py | 7 +- modin/tests/pandas/test_expanding.py | 16 +---- modin/tests/test_utils.py | 15 +++- 10 files changed, 58 insertions(+), 123 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ecedb33b929..3125a1d094e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -667,7 +667,7 @@ jobs: - run: python -m pytest modin/tests/pandas/dataframe/test_reduce.py - run: python -m pytest modin/tests/pandas/dataframe/test_udf.py - run: python -m pytest modin/tests/pandas/dataframe/test_window.py - - uses: codecov/codecov-action@v2 + - uses: ./.github/actions/upload-coverage merge-coverage-artifacts: needs: [test-internals, test-api-and-no-engine, test-defaults, test-all-unidist, test-all, test-experimental, test-sanity] diff --git a/modin/config/envvars.py b/modin/config/envvars.py index c2feb1841b4..1d3f4260f8f 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -915,16 +915,15 @@ 
def _check_vars() -> None: class NativeDataframeMode(EnvironmentVariable, type=str): """ - The mode of execution used for handling dataframes in Modin + The mode of execution used for handling dataframes in Modin. When the env variable is set to None the PandasQueryCompiler would be used - which would lead to modin executing dataframes in distributed fashion. + which would lead to Modin executing dataframes in distributed fashion. When set to Native_pandas NativeQueryCompiler is used which handles the dataframes without distributing, falling back to native pandas functions. In future more execution modes can be added for single node execution so keeping the parameter as string. - """ varname = "MODIN_NATIVE_DATAFRAME_MODE" diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index cb4ed623b32..80e89a577a2 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4617,17 +4617,6 @@ def frame_has_dtypes_cache(self) -> bool: """ return self._modin_frame.has_dtypes_cache - def has_dtypes_cache(self) -> bool: - """ - Check if the dtypes cache exists for the underlying modin frame. - - Returns - ------- - bool - True for base class as dtypes are always present - """ - return True - def get_index_name(self, axis=0): """ Get index name of specified axis. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index a073195a9a4..7c4f7e79f55 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -365,37 +365,6 @@ def copy(self): # END Copy - def has_materialized_dtypes(self): - """ - Check if the undelying modin frame has materialized dtypes - - Returns - ------- - bool - True if if the undelying modin frame and False otherwise. 
- """ - return self._modin_frame.has_materialized_dtypes - - def set_frame_dtypes_cache(self, dtypes): - """ - Set dtypes cache for the underlying modin frame. - - Parameters - ---------- - dtypes : pandas.Series, ModinDtypes, callable or None - """ - self._modin_frame.set_dtypes_cache(dtypes) - - def has_dtypes_cache(self) -> bool: - """ - Check if the dtypes cache exists for the underlying modin frame. - - Returns - ------- - bool - """ - return self._modin_frame.has_dtypes_cache - # Append/Concat/Join (Not Merge) # The append/concat/join operations should ideally never trigger remote # compute. These operations should only ever be manipulations of the diff --git a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py index 8679dc318dc..f010207adbc 100644 --- a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/native_query_compiler.py @@ -294,8 +294,6 @@ def groupby_callable( groupby_obj = df.groupby(by=by, axis=axis, **groupby_kwargs) if agg_name == "agg": if isinstance(agg_func, dict): - # Related to pandas issue when dict with list of funcs as value is passed in agg_func - # https://github.com/pandas-dev/pandas/issues/39103 agg_func = { k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in agg_func.items() @@ -314,12 +312,6 @@ def groupby_callable( return groupby_callable -def _take_2d(df, index=None, columns=None): # noqa: GL08 - columns = columns if columns is not None else slice(None) - index = index if index is not None else slice(None) - return df.iloc[index, columns] - - def _register_binary(op): """ Build function that apply specified binary method of the passed frame. 
@@ -346,7 +338,6 @@ def binary_operator(df, other, **kwargs): if squeeze_self: df = df.squeeze(axis=1) - result = getattr(df, op)(other, **kwargs) if ( not isinstance(result, pandas.Series) @@ -727,6 +718,7 @@ def setitem_bool(self, row_loc, col_loc, item): pandas.DataFrame.update, in_place=True, df_copy=True ) diff = _register_default_pandas(pandas.DataFrame.diff) + dot = _register_default_pandas(_register_binary("dot")) drop = _register_default_pandas(_drop) dropna = _register_default_pandas(pandas.DataFrame.dropna) # axis values switched? dt_ceil = _register_default_pandas(_dt_func_map("ceil")) @@ -859,7 +851,6 @@ def setitem_bool(self, row_loc, col_loc, item): groupby_quantile = _register_default_pandas(_groupby("quantile")) groupby_rank = _register_default_pandas(_groupby("rank")) groupby_shift = _register_default_pandas(_groupby("shift")) - groupby_size = _register_default_pandas(_groupby("size")) groupby_skew = _register_default_pandas(_groupby("skew")) groupby_std = _register_default_pandas(_groupby("std")) groupby_sum = _register_default_pandas(_groupby("sum")) @@ -988,9 +979,6 @@ def setitem_bool(self, row_loc, col_loc, item): rtruediv = _register_default_pandas(_register_binary("rtruediv")) searchsorted = _register_default_pandas(pandas.Series.searchsorted, is_series=True) sem = _register_default_pandas(pandas.DataFrame.sem) - series_update = _register_default_pandas( - pandas.Series.update, is_series=True, in_place=True, df_copy=True - ) series_view = _register_default_pandas(pandas.Series.view, is_series=True) set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) setitem = _register_default_pandas(_setitem) @@ -1054,7 +1042,6 @@ def setitem_bool(self, row_loc, col_loc, item): sub = _register_default_pandas(_register_binary("sub")) sum = _register_default_pandas(pandas.DataFrame.sum) sum_min_count = _register_default_pandas(pandas.DataFrame.sum) - take_2d = _register_default_pandas(_take_2d) to_datetime = 
_register_default_pandas(_to_datetime) to_numeric = _register_default_pandas(_to_numeric) to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_modin=False) @@ -1094,24 +1081,14 @@ def describe(self, percentiles: np.ndarray): include="all", ) - def dot(self, other, squeeze_self=None, squeeze_other=None): - other = try_cast_to_pandas(other) - if squeeze_other: - other = other.squeeze() - if squeeze_self: - result = self._modin_frame.squeeze(axis=1).dot(other) - else: - result = self._modin_frame.dot(other) - if isinstance(result, pandas.Series): - if result.name is None: - result.name = "__reduced__" - result = result.to_frame() - if is_list_like(result): - result = pandas.DataFrame(result) - else: - result = pandas.DataFrame([result]) - - return self.__constructor__(result) + def series_update(self, other, **kwargs): + return _register_default_pandas(_register_binary("update"), in_place=True)( + self, + other=other, + squeeze_self=True, + squeeze_other=True, + **kwargs, + ) def expanding_cov( self, @@ -1134,8 +1111,6 @@ def expanding_cov( else other.to_pandas() ) ) - # expanding_rank = _register_default_pandas(_register_expanding(pandas.core.window.expanding.Expanding.rank)) - return _register_default_pandas( _register_expanding(pandas.core.window.expanding.Expanding.cov) )( @@ -1185,6 +1160,31 @@ def expanding_corr( **kwargs, ) + def groupby_size( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + result = _register_default_pandas(_groupby("size"))( + self, + by=by, + axis=axis, + groupby_kwargs=groupby_kwargs, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + drop=drop, + method="size", + ) + if not groupby_kwargs.get("as_index", False): + # Renaming 'MODIN_UNNAMED_SERIES_LABEL' to a proper name + + result.columns = result.columns[:-1].append(pandas.Index(["size"])) + return result + def get_axis(self, axis): return self._modin_frame.index if axis == 0 else self._modin_frame.columns diff --git 
a/modin/pandas/series.py b/modin/pandas/series.py index 4df41168b5a..7818c52654d 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -144,6 +144,7 @@ def __init__( name = MODIN_UNNAMED_SERIES_LABEL if isinstance(data, pandas.Series) and data.name is not None: name = data.name + query_compiler = from_pandas( pandas.DataFrame( pandas.Series( diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 016ee2e7ac9..a794aeb6446 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -11,7 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import contextlib import io import warnings @@ -88,11 +87,7 @@ ) def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): operation = getattr(modin_df, op) if make_args is not None: operation(**make_args(modin_df)) @@ -106,11 +101,7 @@ def test_ops_defaulting_to_pandas(op, make_args): def test_style(): data = test_data_values[0] - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): pd.DataFrame(data).style @@ -118,11 +109,7 @@ def test_to_timestamp(): idx = pd.date_range("1/1/2012", periods=5, freq="M") df = pd.DataFrame(np.random.randint(0, 100, size=(len(idx), 4)), index=idx) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): df.to_period().to_timestamp() @@ -151,11 +138,7 @@ def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="min") series = pd.Series([0.0, 
None, 2.0, 3.0], index=index) df = pd.DataFrame({"s": series}) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): # We are only testing that this defaults to pandas, so we will just check for # the warning df.asfreq(freq="30S") diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 46983ffd45c..cebe1194c6c 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -11,7 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -import contextlib import warnings import matplotlib @@ -611,11 +610,7 @@ def test_sort_multiindex(sort_remaining): setattr(df, index, new_index) for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): df_equals( modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index d96a38bc21e..fe184dbd249 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -11,14 +11,12 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
-import contextlib - import numpy as np import pandas import pytest import modin.pandas as pd -from modin.config import NativeDataframeMode, NPartitions +from modin.config import NPartitions from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -69,11 +67,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov(data, min_periods, axis, method): - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): eval_general( *create_test_dfs(data), lambda df: getattr( @@ -85,11 +79,7 @@ def test_dataframe_corr_cov(data, min_periods, axis, method): @pytest.mark.parametrize("method", ["corr", "cov"]) def test_dataframe_corr_cov_with_self(method): mdf, pdf = create_test_dfs(test_data["float_nan_data"]) - with ( - warns_that_defaulting_to_pandas() - if not NativeDataframeMode.get() - else contextlib.nullcontext() - ): + with warns_that_defaulting_to_pandas(): eval_general( mdf, pdf, diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py index bc478d957f9..9e5589314cc 100644 --- a/modin/tests/test_utils.py +++ b/modin/tests/test_utils.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
+import contextlib import json from textwrap import dedent, indent from unittest.mock import Mock, patch @@ -21,6 +22,7 @@ import modin.pandas as pd import modin.utils +from modin.config import NativeDataframeMode from modin.error_message import ErrorMessage from modin.tests.pandas.utils import create_test_dfs @@ -263,10 +265,17 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None): Returns ------- - pytest.recwarn.WarningsChecker - A WarningsChecker checking for a UserWarning saying that Modin is - defaulting to Pandas. + pytest.recwarn.WarningsChecker or contextlib.nullcontext + If Modin is not operating in MODIN_NATIVE_DATAFRAME_MODE,a WarningsChecker + is returned whic will check for a UserWarning indicating that Modin + is defaulting to Pandas. If MODIN_NATIVE_DATAFRAME_MODE is set, a + nullcontext is returned to avoid warning about the default to Pandas, + as this occurs due user selecting of MODIN_NATIVE_DATAFRAME_MODE. + """ + if NativeDataframeMode.get(): + return contextlib.nullcontext() + match = "[Dd]efaulting to pandas" if prefix: # Message may be separated by newlines From e8925cb0a2aef81a7830b94631835082bf3cca52 Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:30:03 +0200 Subject: [PATCH 16/19] Apply suggestions from code review Co-authored-by: Iaroslav Igoshev Signed-off-by: arunjose696 --- .github/workflows/ci.yml | 4 +- modin/config/envvars.py | 21 ++-- .../dispatching/factories/factories.py | 6 +- .../pandas/native_query_compiler.py | 109 ++++++++---------- modin/tests/pandas/dataframe/test_binary.py | 2 +- modin/tests/pandas/dataframe/test_default.py | 13 ++- modin/tests/pandas/dataframe/test_indexing.py | 8 +- modin/tests/pandas/dataframe/test_iter.py | 5 +- .../tests/pandas/dataframe/test_join_sort.py | 6 +- .../pandas/dataframe/test_map_metadata.py | 11 +- modin/tests/test_utils.py | 17 +-- 11 files changed, 98 insertions(+), 104 deletions(-) rename 
modin/{experimental => }/core/storage_formats/pandas/native_query_compiler.py (94%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3125a1d094e..5077323df9d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -222,7 +222,7 @@ jobs: with: filters: | test-small-query-compiler: - - 'modin/experimental/core/storage_formats/pandas/small_query_compiler.py' + - 'modin/experimental/core/storage_formats/pandas/native_query_compiler.py' - 'modin/core/storage_formats/pandas/query_compiler.py' - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared @@ -647,7 +647,7 @@ jobs: matrix: python-version: ["3.9"] env: - MODIN_NATIVE_DATAFRAME_MODE: "Native_pandas" + MODIN_NATIVE_DATAFRAME_MODE: "Pandas" name: test-small-query-compiler python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 1d3f4260f8f..59f3dab1d03 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -915,20 +915,19 @@ def _check_vars() -> None: class NativeDataframeMode(EnvironmentVariable, type=str): """ - The mode of execution used for handling dataframes in Modin. - - When the env variable is set to None the PandasQueryCompiler would be used - which would lead to Modin executing dataframes in distributed fashion. - When set to Native_pandas NativeQueryCompiler is used which handles the - dataframes without distributing, falling back to native pandas functions. - - In future more execution modes can be added for single node execution so - keeping the parameter as string. + When this config is set to ``Default``, ``PandasQueryCompiler`` is used, + which leads to Modin executing dataframes in distributed fashion. + When set to a string (e.g., ``Pandas``), ``NativeQueryCompiler`` is used, + which handles the dataframes without distributing, + falling back to native library functions (e.g., ``Pandas``). 
+ + This could be beneficial for handling relatively small dataframes + without involving additional overhead of communication between processes. """ varname = "MODIN_NATIVE_DATAFRAME_MODE" - choices = ("Native_pandas",) - default = None + choices = ("Pandas",) + default = "Default" _check_vars() diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index ee1b68b2dee..deda5113287 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -28,9 +28,7 @@ from modin.config import NativeDataframeMode from modin.core.io import BaseIO -from modin.experimental.core.storage_formats.pandas.native_query_compiler import ( - NativeQueryCompiler, -) +from modin.core.storage_formats.pandas.native_query_compiler import NativeQueryCompiler from modin.utils import get_current_execution _doc_abstract_factory_class = """ @@ -172,7 +170,7 @@ def prepare(cls): method="io.from_pandas", ) def _from_pandas(cls, df): - if NativeDataframeMode.get(): + if NativeDataframeMode.get() == "Pandas": df_copy = df.copy() return NativeQueryCompiler(df_copy) return cls.io_cls.from_pandas(df) diff --git a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py similarity index 94% rename from modin/experimental/core/storage_formats/pandas/native_query_compiler.py rename to modin/core/storage_formats/pandas/native_query_compiler.py index f010207adbc..bd89ec4775e 100644 --- a/modin/experimental/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -406,8 +406,6 @@ def _fillna(df, value, **kwargs): # noqa: GL08 df = df.squeeze(axis=1) if squeeze_value and isinstance(value, pandas.DataFrame): value = value.squeeze(axis=1) - # if len(df.columns) == 1 and df.columns[0] == "__reduced__": - # df = df["__reduced__"] return 
df.fillna(value, **kwargs) @@ -495,12 +493,10 @@ def _get_dummies(df, columns, **kwargs): # noqa: GL08 def _register_default_pandas( func, is_series=False, - squeeze_series=False, squeeze_args=False, squeeze_kwargs=False, - return_modin=True, + return_raw=False, in_place=False, - df_copy=False, filter_kwargs=[], ): """ @@ -512,18 +508,14 @@ def _register_default_pandas( Function to apply. is_series : bool, default: False If True, the passed frame will always be squeezed to a series. - squeeze_series : bool, default: False - If True, the passed frame will always be squeezed to a series if there is a single column named "__reduced__". squeeze_args : bool, default: False If True, all passed arguments will be squeezed. squeeze_kwargs : bool, default: False If True, all passed key word arguments will be squeezed. - return_modin : bool, default: True - If True, the result will always try to convert to DataFrame or Series. + return_raw : bool, default: False + If True, and the result not DataFrame or Series it is returned as is without wrapping in query compiler. in_place : bool, default: False If True, the specified function will be applied on the passed frame in place. - df_copy : bool, default: False - If True, the specified function will be applied to a copy of the passed frame. filter_kwargs : list, default: [] List of key word argument names to remove. 
@@ -535,17 +527,9 @@ def _register_default_pandas( def caller(query_compiler, *args, **kwargs): df = query_compiler._modin_frame - if df_copy: - df = df.copy() if is_series: df = df.squeeze(axis=1) - exclude_names = [ - # "broadcast", - "fold_axis", - # "squeeze_self", - # "squeeze_value", - "ignore_indices", - ] + filter_kwargs + exclude_names = ["fold_axis"] + filter_kwargs kwargs = kwargs.copy() for name in exclude_names: kwargs.pop(name, None) @@ -553,12 +537,11 @@ def caller(query_compiler, *args, **kwargs): kwargs = try_cast_to_pandas(kwargs, squeeze=squeeze_kwargs) result = func(df, *args, **kwargs) inplace_method = kwargs.get("inplace", False) - if in_place: inplace_method = in_place if inplace_method: result = df - if not (return_modin or isinstance(result, (pandas.Series, pandas.DataFrame))): + if return_raw and not isinstance(result, (pandas.Series, pandas.DataFrame)): return result if isinstance(result, pandas.Series): if result.name is None: @@ -576,8 +559,8 @@ class NativeQueryCompiler(BaseQueryCompiler): Query compiler for the pandas storage format. This class translates common query compiler API into - plain pandas to execute operations on small data - depending on the threshold. + native library functions (e.g., pandas) to execute operations + on small data depending on the threshold. Parameters ---------- @@ -585,8 +568,11 @@ class NativeQueryCompiler(BaseQueryCompiler): Pandas frame to query with the compiled queries. 
""" - def __init__(self, pandas_frame): - assert NativeDataframeMode.get() == "Native_Pandas" + _modin_frame: pandas.DataFrame + _shape_hint: Optional[str] + + def __init__(self, pandas_frame, shape_hint: Optional[str] = None): + assert NativeDataframeMode.get() == "Pandas" if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): @@ -595,6 +581,7 @@ def __init__(self, pandas_frame): pandas_frame = pandas.DataFrame(pandas_frame) self._modin_frame = pandas_frame + self._shape_hint = shape_hint def execute(self): pass @@ -617,6 +604,10 @@ def set_frame_dtypes_cache(self, dtypes): Parameters ---------- dtypes : pandas.Series, ModinDtypes, callable or None + + Notes + ----- + This function is for consistency with other QCs, dtypes should be assigned directly on the frame. """ pass @@ -627,6 +618,10 @@ def set_frame_index_cache(self, index): Parameters ---------- index : sequence, callable or None + + Notes + ----- + This function is for consistency with other QCs, dtypes should be assigned directly on the frame. 
""" pass @@ -665,27 +660,25 @@ def setitem_bool(self, row_loc, col_loc, item): self._modin_frame.loc[row_loc._modin_frame.squeeze(axis=1), col_loc] = item return self.__constructor__(self._modin_frame) - __and__ = _register_default_pandas(pandas.DataFrame.__and__, squeeze_series=True) + __and__ = _register_default_pandas(pandas.DataFrame.__and__) __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) - __eq__ = _register_default_pandas(pandas.DataFrame.__eq__, squeeze_series=True) + __eq__ = _register_default_pandas(pandas.DataFrame.__eq__) __format__ = _register_default_pandas(pandas.DataFrame.__format__) - __ge__ = _register_default_pandas(pandas.DataFrame.__ge__, squeeze_series=True) - __gt__ = _register_default_pandas(pandas.DataFrame.__gt__, squeeze_series=True) - __le__ = _register_default_pandas(pandas.DataFrame.__le__, squeeze_series=True) - __lt__ = _register_default_pandas(pandas.DataFrame.__lt__, squeeze_series=True) - __ne__ = _register_default_pandas(pandas.DataFrame.__ne__, squeeze_series=True) - __or__ = _register_default_pandas(pandas.DataFrame.__or__, squeeze_series=True) - __rand__ = _register_default_pandas(pandas.DataFrame.__rand__, squeeze_series=True) - __reduce__ = _register_default_pandas( - pandas.DataFrame.__reduce__, return_modin=False - ) + __ge__ = _register_default_pandas(pandas.DataFrame.__ge__) + __gt__ = _register_default_pandas(pandas.DataFrame.__gt__) + __le__ = _register_default_pandas(pandas.DataFrame.__le__) + __lt__ = _register_default_pandas(pandas.DataFrame.__lt__) + __ne__ = _register_default_pandas(pandas.DataFrame.__ne__) + __or__ = _register_default_pandas(pandas.DataFrame.__or__) + __rand__ = _register_default_pandas(pandas.DataFrame.__rand__) + __reduce__ = _register_default_pandas(pandas.DataFrame.__reduce__, return_raw=True) __reduce_ex__ = _register_default_pandas( - pandas.DataFrame.__reduce_ex__, return_modin=False + pandas.DataFrame.__reduce_ex__, return_raw=True ) - __ror__ = 
_register_default_pandas(pandas.DataFrame.__ror__, squeeze_series=True) - __rxor__ = _register_default_pandas(pandas.DataFrame.__rxor__, squeeze_series=True) + __ror__ = _register_default_pandas(pandas.DataFrame.__ror__) + __rxor__ = _register_default_pandas(pandas.DataFrame.__rxor__) __sizeof__ = _register_default_pandas(pandas.DataFrame.__sizeof__) - __xor__ = _register_default_pandas(pandas.DataFrame.__xor__, squeeze_series=True) + __xor__ = _register_default_pandas(pandas.DataFrame.__xor__) abs = _register_default_pandas(pandas.DataFrame.abs) add = _register_default_pandas(_register_binary("add")) all = _register_default_pandas(pandas.DataFrame.all) @@ -696,10 +689,8 @@ def setitem_bool(self, row_loc, col_loc, item): astype = _register_default_pandas(pandas.DataFrame.astype) case_when = _register_default_pandas(pandas.Series.case_when) cat_codes = _register_default_pandas(lambda ser: ser.cat.codes, is_series=True) - combine = _register_default_pandas(_combine, squeeze_series=True) - combine_first = _register_default_pandas( - lambda df, other: df.combine_first(other), squeeze_series=True - ) + combine = _register_default_pandas(_combine) + combine_first = _register_default_pandas(lambda df, other: df.combine_first(other)) compare = _register_default_pandas(pandas.DataFrame.compare) concat = _register_default_pandas(_concat) conj = _register_default_pandas( @@ -714,9 +705,7 @@ def setitem_bool(self, row_loc, col_loc, item): cumprod = _register_default_pandas(pandas.DataFrame.cumprod) cumsum = _register_default_pandas(pandas.DataFrame.cumsum) delitem = _register_default_pandas(_delitem) - df_update = _register_default_pandas( - pandas.DataFrame.update, in_place=True, df_copy=True - ) + df_update = _register_default_pandas(pandas.DataFrame.update, in_place=True) diff = _register_default_pandas(pandas.DataFrame.diff) dot = _register_default_pandas(_register_binary("dot")) drop = _register_default_pandas(_drop) @@ -825,7 +814,7 @@ def setitem_bool(self, row_loc, 
col_loc, item): fillna = _register_default_pandas(_fillna) first_valid_index = _register_default_pandas( - pandas.DataFrame.first_valid_index, return_modin=False + pandas.DataFrame.first_valid_index, return_raw=True ) floordiv = _register_default_pandas(_register_binary("floordiv")) ge = _register_default_pandas(_register_binary("ge"), filter_kwargs=["dtypes"]) @@ -859,7 +848,7 @@ def setitem_bool(self, row_loc, col_loc, item): idxmax = _register_default_pandas(pandas.DataFrame.idxmax) idxmin = _register_default_pandas(pandas.DataFrame.idxmin) infer_objects = _register_default_pandas( - pandas.DataFrame.infer_objects, return_modin=False + pandas.DataFrame.infer_objects, return_raw=True ) insert = _register_default_pandas( pandas.DataFrame.insert, in_place=True, squeeze_args=True @@ -876,9 +865,9 @@ def setitem_bool(self, row_loc, col_loc, item): ) isna = _register_default_pandas(pandas.DataFrame.isna) join = _register_default_pandas(pandas.DataFrame.join) - kurt = _register_default_pandas(pandas.DataFrame.kurt, return_modin=False) + kurt = _register_default_pandas(pandas.DataFrame.kurt, return_raw=True) last_valid_index = _register_default_pandas( - pandas.DataFrame.last_valid_index, return_modin=False + pandas.DataFrame.last_valid_index, return_raw=True ) le = _register_default_pandas(_register_binary("le"), filter_kwargs=["dtypes"]) lt = _register_default_pandas(_register_binary("lt"), filter_kwargs=["dtypes"]) @@ -886,8 +875,8 @@ def setitem_bool(self, row_loc, col_loc, item): mask = _register_default_pandas(pandas.DataFrame.mask) max = _register_default_pandas(pandas.DataFrame.max) map = _register_default_pandas(pandas.DataFrame.map) - mean = _register_default_pandas(pandas.DataFrame.mean, return_modin=False) - median = _register_default_pandas(pandas.DataFrame.median, return_modin=False) + mean = _register_default_pandas(pandas.DataFrame.mean, return_raw=True) + median = _register_default_pandas(pandas.DataFrame.median, return_raw=True) melt = 
_register_default_pandas(pandas.DataFrame.melt) memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) merge = _register_default_pandas(pandas.DataFrame.merge) @@ -899,9 +888,7 @@ def setitem_bool(self, row_loc, col_loc, item): negative = _register_default_pandas(pandas.DataFrame.__neg__) nlargest = _register_default_pandas(pandas.DataFrame.nlargest) notna = _register_default_pandas(pandas.DataFrame.notna) - nsmallest = _register_default_pandas( - lambda df, **kwargs: df.nsmallest(**kwargs), squeeze_series=True - ) + nsmallest = _register_default_pandas(lambda df, **kwargs: df.nsmallest(**kwargs)) nunique = _register_default_pandas(pandas.DataFrame.nunique) pivot = _register_default_pandas(pandas.DataFrame.pivot) pivot_table = _register_default_pandas(pandas.DataFrame.pivot_table) @@ -982,7 +969,7 @@ def setitem_bool(self, row_loc, col_loc, item): series_view = _register_default_pandas(pandas.Series.view, is_series=True) set_index_from_columns = _register_default_pandas(pandas.DataFrame.set_index) setitem = _register_default_pandas(_setitem) - skew = _register_default_pandas(pandas.DataFrame.skew, return_modin=False) + skew = _register_default_pandas(pandas.DataFrame.skew, return_raw=True) sort_index = _register_default_pandas(_sort_index) sort_columns_by_row_values = _register_default_pandas( lambda df, columns, **kwargs: df.sort_values(by=columns, axis=1, **kwargs) @@ -1044,13 +1031,13 @@ def setitem_bool(self, row_loc, col_loc, item): sum_min_count = _register_default_pandas(pandas.DataFrame.sum) to_datetime = _register_default_pandas(_to_datetime) to_numeric = _register_default_pandas(_to_numeric) - to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_modin=False) + to_numpy = _register_default_pandas(pandas.DataFrame.to_numpy, return_raw=True) to_timedelta = _register_default_pandas( lambda ser, *args, **kwargs: pandas.to_timedelta(ser, *args, **kwargs), is_series=True, ) transpose = 
_register_default_pandas(pandas.DataFrame.transpose) - truediv = _register_default_pandas(_register_binary("truediv"), squeeze_series=True) + truediv = _register_default_pandas(_register_binary("truediv")) unstack = _register_default_pandas(pandas.DataFrame.unstack) var = _register_default_pandas(pandas.DataFrame.var) where = _register_default_pandas(pandas.DataFrame.where) diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index 1b643cfcdba..2f614d5958d 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -211,7 +211,7 @@ def operation(df): reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize( diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index a794aeb6446..da6c034d674 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -124,7 +124,7 @@ def test_to_numpy(data): @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -298,7 +298,7 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - if not NativeDataframeMode.get(): + if NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) @@ -318,7 +318,7 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for non-partitioned executions", ) 
@pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): @@ -611,7 +611,10 @@ def test_pivot(data, index, columns, values, request): or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id or ( - (current_execution in ("BaseOnPython",) or NativeDataframeMode.get()) + ( + current_execution in ("BaseOnPython",) + or NativeDataframeMode.get() == "Pandas" + ) and index is lib.no_default ) ): @@ -992,7 +995,7 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") and StorageFormat.get() != "Base" - and NativeDataframeMode.get() is None, + and NativeDataframeMode.get() == "Default", reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 0f38eaa5ebe..935b49cd318 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -67,6 +67,7 @@ def eval_setitem(md_df, pd_df, value, col=None, loc=None, expected_exception=Non col = pd_df.columns[loc] value_getter = value if callable(value) else (lambda *args, **kwargs: value) + eval_general( md_df, pd_df, @@ -83,6 +84,7 @@ def eval_loc(md_df, pd_df, value, key): md_value, pd_value = value else: md_value, pd_value = value, value + eval_general( md_df, pd_df, @@ -525,6 +527,7 @@ def test_loc_4456( if reverse_value_columns: pdf_value = pdf_value.reindex(columns=pdf_value.columns[::-1]) mdf_value = mdf_value.reindex(columns=mdf_value.columns[::-1]) + eval_loc(modin_df, pandas_df, pdf_value, key) eval_loc(modin_df, pandas_df, (mdf_value, pdf_value), key) @@ -588,7 +591,7 @@ def test_loc_setting_single_categorical_column(): 
@pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not currently support IO functions.", ) def test_loc_multi_index(): @@ -1487,7 +1490,6 @@ def test_reset_index(data, test_async_reset_index): pd_df_cp = pandas_df.copy() if test_async_reset_index: modin_df._query_compiler.set_frame_index_cache(None) - modin_df_cp.reset_index(inplace=True) pd_df_cp.reset_index(inplace=True) df_equals(modin_df_cp, pd_df_cp) @@ -2245,7 +2247,7 @@ def test___setitem__partitions_aligning(): @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not currently support IO functions.", ) def test___setitem__with_mismatched_partitions(): diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py index b00ae056920..ccd6e632d10 100644 --- a/modin/tests/pandas/dataframe/test_iter.py +++ b/modin/tests/pandas/dataframe/test_iter.py @@ -142,7 +142,8 @@ def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame def test___finalize__(): data = test_data_values[0] - with warns_that_defaulting_to_pandas(): + # Using force for warns_that_defaulting_to_pandas as the warnings are raised in Dataframe layer, before geting into QueryCompiler layer. 
+ with warns_that_defaulting_to_pandas(force=True): pd.DataFrame(data).__finalize__(None) @@ -230,7 +231,7 @@ def test___repr__(): "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" pandas_df = pandas.read_csv(io.StringIO(string_data)) - with warns_that_defaulting_to_pandas(): + with warns_that_defaulting_to_pandas(force=True): modin_df = pd.read_csv(io.StringIO(string_data)) assert repr(pandas_df) == repr(modin_df) diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index cebe1194c6c..06ee419e6ec 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -732,7 +732,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -772,7 +772,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", "Hello"]]) ) - if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -893,7 +893,7 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( Engine.get() not in ("Ray", "Unidist", "Dask") - or NativeDataframeMode.get() is not None, + or NativeDataframeMode.get() == "Pandas", reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git 
a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index 40c910ed4cc..cc3d6753ea0 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -304,7 +304,10 @@ def test_copy(data): assert new_modin_df.columns is not modin_df.columns assert new_modin_df.dtypes is not modin_df.dtypes - if get_current_execution() != "BaseOnPython" and not NativeDataframeMode.get(): + if ( + get_current_execution() != "BaseOnPython" + and NativeDataframeMode.get() == "Default" + ): assert np.array_equal( new_modin_df._query_compiler._modin_frame._partitions, modin_df._query_compiler._modin_frame._partitions, @@ -571,7 +574,7 @@ def test_astype_int64_to_astype_category_github_issue_6259(): reason="BaseOnPython doesn't have proxy categories", ) @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler doesn't have proxy categories", ) class TestCategoricalProxyDtype: @@ -797,7 +800,7 @@ def comparator(df1, df2): @pytest.mark.skipif( - NativeDataframeMode.get() is not None, + NativeDataframeMode.get() == "Pandas", reason="NativeQueryCompiler does not contain partitions.", ) def test_convert_dtypes_multiple_row_partitions(): @@ -824,7 +827,7 @@ def test_convert_dtypes_5653(): modin_part1 = pd.DataFrame({"col1": ["a", "b", "c", "d"]}) modin_part2 = pd.DataFrame({"col1": [None, None, None, None]}) modin_df = pd.concat([modin_part1, modin_part2]) - if StorageFormat.get() == "Pandas" and not NativeDataframeMode.get(): + if StorageFormat.get() == "Pandas" and NativeDataframeMode.get() == "Default": assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) modin_df = modin_df.convert_dtypes() assert len(modin_df.dtypes) == 1 diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py index 9e5589314cc..55075f3a743 100644 --- a/modin/tests/test_utils.py +++ b/modin/tests/test_utils.py @@ 
-250,7 +250,7 @@ def test_format_string(): assert answer == expected -def warns_that_defaulting_to_pandas(prefix=None, suffix=None): +def warns_that_defaulting_to_pandas(prefix=None, suffix=None, force=False): """ Assert that code warns that it's defaulting to pandas. @@ -262,18 +262,19 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None): suffix : Optional[str] If specified, checks that the end of the warning message matches this argument after "[Dd]efaulting to pandas". + force : Optional[bool] + If true return the pytest.recwarn.WarningsChecker irrespective of ``NativeDataframeMode`` Returns ------- pytest.recwarn.WarningsChecker or contextlib.nullcontext - If Modin is not operating in MODIN_NATIVE_DATAFRAME_MODE,a WarningsChecker - is returned whic will check for a UserWarning indicating that Modin - is defaulting to Pandas. If MODIN_NATIVE_DATAFRAME_MODE is set, a - nullcontext is returned to avoid warning about the default to Pandas, - as this occurs due user selecting of MODIN_NATIVE_DATAFRAME_MODE. - + If Modin is not operating in ``NativeDataframeMode``, a ``WarningsChecker`` + is returned, which will check for a ``UserWarning`` indicating that Modin + is defaulting to Pandas. If ``NativeDataframeMode`` is set, a + ``nullcontext`` is returned to avoid the warning about defaulting to Pandas, + as this occurs due to user setting of ``NativeDataframeMode``. 
""" - if NativeDataframeMode.get(): + if NativeDataframeMode.get() == "Pandas" and not force: return contextlib.nullcontext() match = "[Dd]efaulting to pandas" From 3585c74df0d4349910fc4dfb5c1a6a3ca0bc82d2 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Mon, 24 Jun 2024 01:25:13 -0500 Subject: [PATCH 17/19] fix conflict --- .github/workflows/ci.yml | 2 +- modin/tests/pandas/dataframe/test_indexing.py | 6 +- versioneer.py | 400 +++++------------- 3 files changed, 97 insertions(+), 311 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5077323df9d..08da1d664c0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -222,7 +222,7 @@ jobs: with: filters: | test-small-query-compiler: - - 'modin/experimental/core/storage_formats/pandas/native_query_compiler.py' + - 'modin/core/storage_formats/pandas/native_query_compiler.py' - 'modin/core/storage_formats/pandas/query_compiler.py' - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 935b49cd318..a47474eb76c 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -21,11 +21,7 @@ from pandas._testing import ensure_clean import modin.pandas as pd -<<<<<<< HEAD -from modin.config import MinRowPartitionSize, NPartitions, UsePlainPandasQueryCompiler -======= -from modin.config import MinPartitionSize, NativeDataframeMode, NPartitions ->>>>>>> 1984aa1f (renaming to PlainPandasQueryCompiler to NativeDataframeMode) +from modin.config import MinRowPartitionSize, NativeDataframeMode, NPartitions from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal from modin.tests.pandas.utils import ( diff --git a/versioneer.py b/versioneer.py index 71109f05c02..0ae83dbaaf9 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,12 +1,4 @@ -<<<<<<< HEAD -<<<<<<< HEAD # 
Version: 0.29 -======= -======= - ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) -# Version: 0.18 ->>>>>>> 4e967422 (fixing tests) """The Versioneer - like a rocketeer, but for versions. @@ -368,16 +360,11 @@ def get_root() -> str: setup_py = os.path.join(root, "setup.py") pyproject_toml = os.path.join(root, "pyproject.toml") versioneer_py = os.path.join(root, "versioneer.py") -<<<<<<< HEAD if not ( os.path.exists(setup_py) or os.path.exists(pyproject_toml) or os.path.exists(versioneer_py) ): -======= - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): -<<<<<<< HEAD ->>>>>>> 4e967422 (fixing tests) err = ( "Versioneer was unable to run the project root directory. " "Versioneer requires setup.py to be executed from " @@ -385,13 +372,6 @@ def get_root() -> str: "or in a way that lets it use sys.argv[0] to find the root " "(like 'python path/to/setup.py COMMAND')." ) -======= - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -403,23 +383,11 @@ def get_root() -> str: my_path = os.path.realpath(os.path.abspath(__file__)) me_dir = os.path.normcase(os.path.splitext(my_path)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) -<<<<<<< HEAD if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals(): print( "Warning: build in %s is using versioneer.py from %s" % (os.path.dirname(my_path), versioneer_py) -======= - if me_dir != vsr_dir: -<<<<<<< HEAD - print( - "Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py) ->>>>>>> 4e967422 (fixing tests) ) 
-======= - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) except NameError: pass return root @@ -455,17 +423,6 @@ def get_config_from_root(root: str) -> VersioneerConfig: # common VersioneerConfig users at the moment. We verify against # `None` values elsewhere where it matters -<<<<<<< HEAD -======= - def get(parser, name): - if parser.has_option("versioneer", name): - return parser.get("versioneer", name) - return None -<<<<<<< HEAD - ->>>>>>> 4e967422 (fixing tests) -======= ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) cfg = VersioneerConfig() cfg.VCS = section["VCS"] cfg.style = section.get("style", "") @@ -493,24 +450,17 @@ class NotThisMethod(Exception): HANDLERS: Dict[str, Dict[str, Callable]] = {} -<<<<<<< HEAD def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: -======= -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): ->>>>>>> 4e967422 (fixing tests) """Store f in HANDLERS[vcs][method].""" HANDLERS.setdefault(vcs, {})[method] = f return f + return decorate -<<<<<<< HEAD -<<<<<<< HEAD def run_command( commands: List[str], args: List[str], @@ -519,13 +469,6 @@ def run_command( hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: -======= -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): ->>>>>>> 4e967422 (fixing tests) -======= -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) """Call the given command(s).""" assert isinstance(commands, list) process = None @@ -541,29 +484,14 @@ def 
run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git -<<<<<<< HEAD -<<<<<<< HEAD process = subprocess.Popen( [command] + args, -======= - p = subprocess.Popen( - [c] + args, ->>>>>>> 4e967422 (fixing tests) cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), -<<<<<<< HEAD **popen_kwargs, -======= ->>>>>>> 4e967422 (fixing tests) ) -======= - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) break except OSError as e: if e.errno == errno.ENOENT: @@ -585,17 +513,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, process.returncode -<<<<<<< HEAD LONG_VERSION_PY[ "git" -<<<<<<< HEAD ] = r''' -======= -] = ''' ->>>>>>> 4e967422 (fixing tests) -======= -LONG_VERSION_PY['git'] = ''' ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -1340,15 +1260,7 @@ def git_versions_from_keywords( # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " -<<<<<<< HEAD -<<<<<<< HEAD tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} -======= - tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) ->>>>>>> 4e967422 (fixing tests) -======= - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if not tags: # Either we're using git < 1.8.3, or there really are no tags. 
We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1357,15 +1269,7 @@ def git_versions_from_keywords( # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". -<<<<<<< HEAD -<<<<<<< HEAD tags = {r for r in refs if re.search(r"\d", r)} -======= - tags = set([r for r in refs if re.search(r"\d", r)]) ->>>>>>> 4e967422 (fixing tests) -======= - tags = set([r for r in refs if re.search(r'\d', r)]) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1373,31 +1277,31 @@ def git_versions_from_keywords( for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): -<<<<<<< HEAD r = ref[len(tag_prefix) :] -<<<<<<< HEAD # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r"\d", r): continue -======= ->>>>>>> 4e967422 (fixing tests) -======= - r = ref[len(tag_prefix):] ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } 
@register_vcs_handler("git", "pieces_from_vcs") @@ -1414,8 +1318,6 @@ def git_pieces_from_vcs( if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] -<<<<<<< HEAD -<<<<<<< HEAD # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. @@ -1424,13 +1326,6 @@ def git_pieces_from_vcs( runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) -======= - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) ->>>>>>> 4e967422 (fixing tests) -======= - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1438,12 +1333,7 @@ def git_pieces_from_vcs( # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) -<<<<<<< HEAD -<<<<<<< HEAD describe_out, rc = runner( -======= - describe_out, rc = run_command( ->>>>>>> 4e967422 (fixing tests) GITS, [ "describe", @@ -1452,20 +1342,10 @@ def git_pieces_from_vcs( "--always", "--long", "--match", -<<<<<<< HEAD f"{tag_prefix}[[:digit:]]*", -======= - "%s*" % tag_prefix, ->>>>>>> 4e967422 (fixing tests) ], cwd=root, ) -======= - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1520,25 +1400,16 @@ def git_pieces_from_vcs( dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = 
git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: -<<<<<<< HEAD # unparsable. Maybe git-describe is misbehaving? -======= - # unparseable. Maybe git-describe is misbehaving? -<<<<<<< HEAD ->>>>>>> 4e967422 (fixing tests) pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out -======= - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) return pieces # tag @@ -1547,10 +1418,12 @@ def git_pieces_from_vcs( if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1561,8 +1434,6 @@ def git_pieces_from_vcs( else: # HEX: no tags pieces["closest-tag"] = None -<<<<<<< HEAD -<<<<<<< HEAD out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits @@ -1571,24 +1442,6 @@ def git_pieces_from_vcs( # Use only the last line. Previous lines may contain GPG signature # information. 
date = date.splitlines()[-1] -======= - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() ->>>>>>> 4e967422 (fixing tests) -======= - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1648,7 +1501,6 @@ def versions_from_parentdir( for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): -<<<<<<< HEAD return { "version": dirname[len(parentdir_prefix) :], "full-revisionid": None, @@ -1656,23 +1508,14 @@ def versions_from_parentdir( "error": None, "date": None, } -<<<<<<< HEAD rootdirs.append(root) root = os.path.dirname(root) # up a level -======= -======= - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level ->>>>>>> 4e967422 (fixing tests) if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1701,11 +1544,13 @@ def versions_from_file(filename: str) -> Dict[str, Any]: contents = f.read() except OSError: raise 
NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1713,16 +1558,7 @@ def versions_from_file(filename: str) -> Dict[str, Any]: def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None: """Write the given version number to the given _version.py file.""" -<<<<<<< HEAD -======= - os.unlink(filename) -<<<<<<< HEAD ->>>>>>> 4e967422 (fixing tests) contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) -======= - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1754,8 +1590,7 @@ def render_pep440(pieces: Dict[str, Any]) -> str: rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1945,11 +1780,13 @@ def render_git_describe_long(pieces: Dict[str, Any]) -> str: def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } 
if not style or style == "default": style = "pep440" # the default @@ -1973,9 +1810,13 @@ def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1997,19 +1838,10 @@ def get_versions(verbose: bool = False) -> Dict[str, Any]: assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS -<<<<<<< HEAD verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` -======= - verbose = verbose or cfg.verbose -<<<<<<< HEAD ->>>>>>> 4e967422 (fixing tests) assert ( cfg.versionfile_source is not None ), "please set versioneer.versionfile_source" -======= - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -2063,9 +1895,13 @@ def get_versions(verbose: bool = False) -> Dict[str, Any]: if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } def get_version() -> str: @@ -2118,6 +1954,7 @@ def run(self) -> None: print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in setuptools @@ -2157,10 
+1994,10 @@ def run(self) -> None: # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "build_ext" in cmds: @@ -2220,17 +2057,21 @@ def run(self) -> None: os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore except ImportError: @@ -2249,13 +2090,17 @@ def run(self) -> None: os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # sdist farms its file list building out to egg_info @@ -2322,8 +2167,10 @@ def make_release_tree(self, base_dir: str, files: List[str]) -> None: # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -2383,26 +2230,9 @@ def do_setup() -> int: root = get_root() try: cfg = get_config_from_root(root) -<<<<<<< HEAD -<<<<<<< HEAD except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): -======= - except ( - EnvironmentError, - configparser.NoSectionError, - configparser.NoOptionError, - ) as err: - if isinstance(err, (EnvironmentError, configparser.NoSectionError)): ->>>>>>> 4e967422 (fixing tests) print("Adding sample versioneer config to setup.cfg", file=sys.stderr) -======= - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as err: - if isinstance(err, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) ->>>>>>> 352ca3b6 (removing additional parameter 
from try_cast_to_pandas) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -2411,23 +2241,19 @@ def do_setup() -> int: print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - -<<<<<<< HEAD + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") -<<<<<<< HEAD maybe_ipy: Optional[str] = ipy -======= ->>>>>>> 4e967422 (fixing tests) -======= - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") ->>>>>>> 352ca3b6 (removing additional parameter from try_cast_to_pandas) if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -2448,43 +2274,7 @@ def do_setup() -> int: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) -<<<<<<< HEAD maybe_ipy = None -======= - ipy = None - - # Make sure both the top-level "versioneer.py" and versionfile_source - # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to - # install the package without this. - manifest_in = os.path.join(root, "MANIFEST.in") - simple_includes = set() - try: - with open(manifest_in, "r") as f: - for line in f: - if line.startswith("include "): - for include in line.split()[1:]: - simple_includes.add(include) - except EnvironmentError: - pass - # That doesn't cover everything MANIFEST.in can do - # (http://docs.python.org/2/distutils/sourcedist.html#commands), so - # it might give some false negatives. Appending redundant 'include' - # lines is safe, though. 
- if "versioneer.py" not in simple_includes: - print(" appending 'versioneer.py' to MANIFEST.in") - with open(manifest_in, "a") as f: - f.write("include versioneer.py\n") - else: - print(" 'versioneer.py' already in MANIFEST.in") - if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) - with open(manifest_in, "a") as f: - f.write("include %s\n" % cfg.versionfile_source) - else: - print(" versionfile_source already in MANIFEST.in") ->>>>>>> 4e967422 (fixing tests) # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword From c748be677d2a5f0f73cdccb3e8f36ab5d14259fd Mon Sep 17 00:00:00 2001 From: Arun Jose <40291569+arunjose696@users.noreply.github.com> Date: Thu, 4 Jul 2024 21:37:09 +0200 Subject: [PATCH 18/19] Apply suggestions from code review Co-authored-by: Iaroslav Igoshev --- .github/workflows/ci.yml | 2 +- modin/config/envvars.py | 2 +- .../storage_formats/pandas/native_query_compiler.py | 10 ++++++---- modin/tests/pandas/dataframe/test_iter.py | 3 ++- modin/tests/test_utils.py | 2 +- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08da1d664c0..f0fc8437480 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -214,7 +214,7 @@ jobs: unidist: ${{ steps.filter.outputs.unidist }} engines: ${{ steps.engines.outputs.engines }} experimental: ${{ steps.experimental.outputs.experimental }} - test-small-query-compiler: ${{ steps.filter.outputs.test-small-query-compiler }} + test-native-dataframe-mode: ${{ steps.filter.outputs.test-native-dataframe-mode }} steps: - uses: actions/checkout@v4 - uses: dorny/paths-filter@v3 diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 59f3dab1d03..f39f4a79ea2 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -926,7 +926,7 @@ class 
NativeDataframeMode(EnvironmentVariable, type=str): """ varname = "MODIN_NATIVE_DATAFRAME_MODE" - choices = ("Pandas",) + choices = ("Default", "Pandas",) default = "Default" diff --git a/modin/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py index bd89ec4775e..39733724a35 100644 --- a/modin/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -44,7 +44,7 @@ def _get_axis(axis): Returns ------- - callable(PandasQueryCompiler) -> pandas.Index + callable(NativeQueryCompiler) -> pandas.Index """ if axis == 0: return lambda self: self._modin_frame.index @@ -63,7 +63,7 @@ def _set_axis(axis): Returns ------- - callable(PandasQueryCompiler) + callable(NativeQueryCompiler) """ if axis == 0: @@ -607,7 +607,8 @@ def set_frame_dtypes_cache(self, dtypes): Notes ----- - This function is for consistency with other QCs, dtypes should be assigned directly on the frame. + This function is for consistency with other QCs, + dtypes should be assigned directly on the frame. """ pass @@ -621,7 +622,8 @@ def set_frame_index_cache(self, index): Notes ----- - This function is for consistency with other QCs, dtypes should be assigned directly on the frame. + This function is for consistency with other QCs, + index should be assigned directly on the frame. """ pass diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py index ccd6e632d10..e420f154a0f 100644 --- a/modin/tests/pandas/dataframe/test_iter.py +++ b/modin/tests/pandas/dataframe/test_iter.py @@ -142,7 +142,8 @@ def test_display_options_for___repr__(max_rows_columns, expand_frame_repr, frame def test___finalize__(): data = test_data_values[0] - # Using force for warns_that_defaulting_to_pandas as the warnings are raised in Dataframe layer, before geting into QueryCompiler layer. 
+ # Using `force` for `NativeDataframeMode` as the warnings are raised at the API layer, + # before getting into the Query Compiler layer. with warns_that_defaulting_to_pandas(force=True): pd.DataFrame(data).__finalize__(None) diff --git a/modin/tests/test_utils.py b/modin/tests/test_utils.py index 55075f3a743..1597b052853 100644 --- a/modin/tests/test_utils.py +++ b/modin/tests/test_utils.py @@ -263,7 +263,7 @@ def warns_that_defaulting_to_pandas(prefix=None, suffix=None, force=False): If specified, checks that the end of the warning message matches this argument after "[Dd]efaulting to pandas". force : Optional[bool] - If true return the pytest.recwarn.WarningsChecker irrespective of ``NativeDataframeMode`` + If ``True``, return the ``pytest.recwarn.WarningsChecker`` irrespective of ``NativeDataframeMode``. Returns ------- From 4f40c12c4a2f464b4edb6ecedec2ffb9c9003bb4 Mon Sep 17 00:00:00 2001 From: arunjose696 Date: Thu, 4 Jul 2024 16:06:25 -0500 Subject: [PATCH 19/19] PR comments Signed-off-by: arunjose696 --- .github/workflows/ci.yml | 11 ++---- modin/config/envvars.py | 11 ++++-- .../pandas/native_query_compiler.py | 39 ++++++++++++------- modin/tests/pandas/dataframe/test_iter.py | 2 + 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f0fc8437480..76f9eab1b52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -221,9 +221,8 @@ jobs: id: filter with: filters: | - test-small-query-compiler: + test-native-dataframe-mode: - 'modin/core/storage_formats/pandas/native_query_compiler.py' - 'modin/core/storage_formats/pandas/query_compiler.py' - 'modin/core/storage_formats/base/query_compiler.py' shared: &shared - 'modin/core/execution/dispatching/**' @@ -636,9 +635,9 @@ jobs: python-version: ${{matrix.python-version}} - run: python -m pytest modin/tests/experimental/spreadsheet/test_general.py - test-small-query-compiler: + test-native-dataframe-mode: needs: [ lint-flake8,
execution-filter] - if: ${{ needs.execution-filter.outputs.test-small-query-compiler == 'true' }} + if: ${{ needs.execution-filter.outputs.test-native-dataframe-mode == 'true' }} runs-on: ubuntu-latest defaults: run: @@ -648,15 +647,13 @@ jobs: python-version: ["3.9"] env: MODIN_NATIVE_DATAFRAME_MODE: "Pandas" - name: test-small-query-compiler python ${{matrix.python-version}}) + name: test-native-dataframe-mode python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 - uses: ./.github/actions/mamba-env with: environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - - run: python -m pytest modin/tests/config/test_envvars.py - - run: python -m pytest modin/tests/config/test_parameter.py - run: python -m pytest modin/tests/pandas/dataframe/test_binary.py - run: python -m pytest modin/tests/pandas/dataframe/test_default.py - run: python -m pytest modin/tests/pandas/dataframe/test_indexing.py diff --git a/modin/config/envvars.py b/modin/config/envvars.py index f39f4a79ea2..676f1a31d8a 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -915,18 +915,23 @@ def _check_vars() -> None: class NativeDataframeMode(EnvironmentVariable, type=str): """ + Configures the query compiler to process Modin data. + When this config is set to ``Default``, ``PandasQueryCompiler`` is used, which leads to Modin executing dataframes in distributed fashion. - When set to a string (e.g., ``Pandas``), ``NativeQueryCompiler`` is used, + When set to a string (e.g., ``pandas``), ``NativeQueryCompiler`` is used, which handles the dataframes without distributing, - falling back to native library functions (e.g., ``Pandas``). + falling back to native library functions (e.g., ``pandas``). This could be beneficial for handling relatively small dataframes without involving additional overhead of communication between processes. 
""" varname = "MODIN_NATIVE_DATAFRAME_MODE" - choices = ("Default", "Pandas",) + choices = ( + "Default", + "Pandas", + ) default = "Default" diff --git a/modin/core/storage_formats/pandas/native_query_compiler.py b/modin/core/storage_formats/pandas/native_query_compiler.py index 39733724a35..bfe331cfc6e 100644 --- a/modin/core/storage_formats/pandas/native_query_compiler.py +++ b/modin/core/storage_formats/pandas/native_query_compiler.py @@ -352,6 +352,20 @@ def binary_operator(df, other, **kwargs): def _register_expanding(func): + """ + Build function that apply specified expanding window functions. + + Parameters + ---------- + func : str + Expanding window functionname to apply. + + Returns + ------- + callable(pandas.DataFrame, *args, **kwargs) -> pandas.DataFrame + Function to be applied to the frame. + """ + def expanding_operator(df, fold_axis, rolling_args, *args, **kwargs): squeeze_self = kwargs.pop("squeeze_self", False) @@ -497,7 +511,6 @@ def _register_default_pandas( squeeze_kwargs=False, return_raw=False, in_place=False, - filter_kwargs=[], ): """ Build function that apply specified method of the passed frame. @@ -516,8 +529,6 @@ def _register_default_pandas( If True, and the result not DataFrame or Series it is returned as is without wrapping in query compiler. in_place : bool, default: False If True, the specified function will be applied on the passed frame in place. - filter_kwargs : list, default: [] - List of key word argument names to remove. Returns ------- @@ -529,7 +540,7 @@ def caller(query_compiler, *args, **kwargs): df = query_compiler._modin_frame if is_series: df = df.squeeze(axis=1) - exclude_names = ["fold_axis"] + filter_kwargs + exclude_names = ["fold_axis", "dtypes"] kwargs = kwargs.copy() for name in exclude_names: kwargs.pop(name, None) @@ -565,7 +576,9 @@ class NativeQueryCompiler(BaseQueryCompiler): Parameters ---------- pandas_frame : pandas.DataFrame - Pandas frame to query with the compiled queries. 
+ The pandas frame to query with the compiled queries. + shape_hint : {"row", "column", None}, default: None + Shape hint for frames known to be a column or a row, otherwise None. """ _modin_frame: pandas.DataFrame @@ -767,7 +780,7 @@ def setitem_bool(self, row_loc, col_loc, item): dt_weekofyear = _register_default_pandas(_dt_prop_map("weekofyear")) dt_year = _register_default_pandas(_dt_prop_map("year")) duplicated = _register_default_pandas(pandas.DataFrame.duplicated) - eq = _register_default_pandas(_register_binary("eq"), filter_kwargs=["dtypes"]) + eq = _register_default_pandas(_register_binary("eq")) equals = _register_default_pandas(_register_binary("equals")) eval = _register_default_pandas(pandas.DataFrame.eval) explode = _register_default_pandas(pandas.DataFrame.explode) @@ -819,7 +832,7 @@ def setitem_bool(self, row_loc, col_loc, item): pandas.DataFrame.first_valid_index, return_raw=True ) floordiv = _register_default_pandas(_register_binary("floordiv")) - ge = _register_default_pandas(_register_binary("ge"), filter_kwargs=["dtypes"]) + ge = _register_default_pandas(_register_binary("ge")) get_dummies = _register_default_pandas(_get_dummies) getitem_array = _register_default_pandas(_getitem_array) getitem_row_array = _register_default_pandas(_getitem_row_array) @@ -846,7 +859,7 @@ def setitem_bool(self, row_loc, col_loc, item): groupby_std = _register_default_pandas(_groupby("std")) groupby_sum = _register_default_pandas(_groupby("sum")) groupby_var = _register_default_pandas(_groupby("var")) - gt = _register_default_pandas(_register_binary("gt"), filter_kwargs=["dtypes"]) + gt = _register_default_pandas(_register_binary("gt")) idxmax = _register_default_pandas(pandas.DataFrame.idxmax) idxmin = _register_default_pandas(pandas.DataFrame.idxmin) infer_objects = _register_default_pandas( @@ -871,8 +884,8 @@ def setitem_bool(self, row_loc, col_loc, item): last_valid_index = _register_default_pandas( pandas.DataFrame.last_valid_index, return_raw=True ) - le = 
_register_default_pandas(_register_binary("le"), filter_kwargs=["dtypes"]) - lt = _register_default_pandas(_register_binary("lt"), filter_kwargs=["dtypes"]) + le = _register_default_pandas(_register_binary("le")) + lt = _register_default_pandas(_register_binary("lt")) # mad = _register_default_pandas(pandas.DataFrame.mad) mask = _register_default_pandas(pandas.DataFrame.mask) max = _register_default_pandas(pandas.DataFrame.max) @@ -886,7 +899,7 @@ def setitem_bool(self, row_loc, col_loc, item): mod = _register_default_pandas(_register_binary("mod")) mode = _register_default_pandas(pandas.DataFrame.mode) mul = _register_default_pandas(_register_binary("mul")) - ne = _register_default_pandas(_register_binary("ne"), filter_kwargs=["dtypes"]) + ne = _register_default_pandas(_register_binary("ne")) negative = _register_default_pandas(pandas.DataFrame.__neg__) nlargest = _register_default_pandas(pandas.DataFrame.nlargest) notna = _register_default_pandas(pandas.DataFrame.notna) @@ -1214,7 +1227,7 @@ def from_pandas(cls, df, data_cls): @classmethod def from_arrow(cls, at, data_cls): - return + return cls(at.to_pandas()) def free(self): return @@ -1231,7 +1244,7 @@ def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): @classmethod def from_dataframe(cls, df, data_cls): - return cls(data_cls.from_dataframe(df)) + return cls(pandas.api.interchange.from_dataframe(df)) # END Dataframe exchange protocol diff --git a/modin/tests/pandas/dataframe/test_iter.py b/modin/tests/pandas/dataframe/test_iter.py index e420f154a0f..38ab70524a2 100644 --- a/modin/tests/pandas/dataframe/test_iter.py +++ b/modin/tests/pandas/dataframe/test_iter.py @@ -232,6 +232,8 @@ def test___repr__(): "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" pandas_df = pandas.read_csv(io.StringIO(string_data)) + # Using `force` for 
`NativeDataframeMode` as the warnings are raised at the API layer, + # before getting into the Query Compiler layer. with warns_that_defaulting_to_pandas(force=True): modin_df = pd.read_csv(io.StringIO(string_data)) assert repr(pandas_df) == repr(modin_df)