From 8c6544e5be8f601547cbb420d26510d83e69d6b7 Mon Sep 17 00:00:00 2001 From: "Igoshev, Iaroslav" Date: Thu, 16 May 2024 15:06:32 +0000 Subject: [PATCH] Enable test_default.py and test_join_sort.py Signed-off-by: Igoshev, Iaroslav --- .github/workflows/ci.yml | 2 +- modin/config/__init__.py | 4 +- modin/config/envvars.py | 6 +- .../pandas/small_query_compiler.py | 142 ++++-------------- modin/pandas/base.py | 4 +- modin/pandas/dataframe.py | 8 +- modin/pandas/io.py | 8 +- modin/pandas/series.py | 8 +- modin/tests/pandas/dataframe/test_binary.py | 11 +- modin/tests/pandas/dataframe/test_default.py | 33 ++-- modin/tests/pandas/dataframe/test_indexing.py | 24 +-- .../tests/pandas/dataframe/test_join_sort.py | 32 ++-- modin/tests/pandas/test_expanding.py | 6 +- setup.cfg | 2 +- 14 files changed, 109 insertions(+), 181 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f652d9e918b..504727f7868 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -717,7 +717,7 @@ jobs: matrix: python-version: ["3.9"] env: - MODIN_SMALL_QUERY_COMPILER: "True" + MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER: "True" name: test-small-query-compiler python ${{matrix.python-version}}) steps: - uses: actions/checkout@v4 diff --git a/modin/config/__init__.py b/modin/config/__init__.py index 47cf28a6de0..4ef61d6ef62 100644 --- a/modin/config/__init__.py +++ b/modin/config/__init__.py @@ -32,7 +32,6 @@ GpuCount, HdkFragmentSize, HdkLaunchParameters, - InitializeWithSmallQueryCompilers, IsDebug, IsExperimental, IsRayCluster, @@ -58,6 +57,7 @@ TestReadFromPostgres, TestReadFromSqlServer, TrackFileLeaks, + UsePlainPandasQueryCompiler, use_range_partitioning_groupby, ) from modin.config.pubsub import Parameter, ValueSource, context @@ -74,7 +74,7 @@ "CpuCount", "GpuCount", "Memory", - "InitializeWithSmallQueryCompilers", + "UsePlainPandasQueryCompiler", # Ray specific "IsRayCluster", "RayRedisAddress", diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 052c12d9308..126b516d388 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -1051,10 +1051,10 @@ def _check_vars() -> None: ) -class InitializeWithSmallQueryCompilers(EnvironmentVariable, type=str): - """Set to true to use implementation of SmallQueryCompiler.""" +class UsePlainPandasQueryCompiler(EnvironmentVariable, type=bool): + """Set to true to use implementation of PlainPandasQueryCompiler.""" - varname = "MODIN_SMALL_QUERY_COMPILER" + varname = "MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER" default = False diff --git a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py index 8112fbd11f4..d6767478926 100644 --- a/modin/experimental/core/storage_formats/pandas/small_query_compiler.py +++ b/modin/experimental/core/storage_formats/pandas/small_query_compiler.py @@ -12,21 +12,18 @@ # governing permissions and limitations under the License. """ -Module contains ``SmallQueryCompiler`` class. +Module contains ``PlainPandasQueryCompiler`` class. -``SmallQueryCompiler`` is responsible for compiling efficient DataFrame algebra +``PlainPandasQueryCompiler`` is responsible for compiling efficient DataFrame algebra queries for small data and empty ``PandasDataFrame``. """ -import warnings - import numpy as np import pandas from pandas.core.dtypes.common import is_list_like, is_scalar -from modin.config.envvars import InitializeWithSmallQueryCompilers +from modin.config.envvars import UsePlainPandasQueryCompiler from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler -from modin.error_message import ErrorMessage from modin.utils import ( MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings, @@ -48,9 +45,9 @@ def _get_axis(axis): callable(PandasQueryCompiler) -> pandas.Index """ if axis == 0: - return lambda self: self._pandas_frame.index + return lambda self: self._modin_frame.index else: - return lambda self: self._pandas_frame.columns + return lambda self: self._modin_frame.columns def _set_axis(axis): @@ -69,12 +66,12 @@ def _set_axis(axis): if axis == 0: def set_axis(self, idx): - self._pandas_frame.index = idx + self._modin_frame.index = idx else: def set_axis(self, cols): - self._pandas_frame.columns = cols + self._modin_frame.columns = cols return set_axis @@ -572,7 +569,7 @@ def _register_default_pandas( """ def caller(query_compiler, *args, **kwargs): - df = query_compiler._pandas_frame + df = query_compiler._modin_frame if df_copy: df = df.copy() if is_series: @@ -605,21 +602,22 @@ def caller(query_compiler, *args, **kwargs): @_inherit_docstrings(BaseQueryCompiler) -class SmallQueryCompiler(BaseQueryCompiler): +class PlainPandasQueryCompiler(BaseQueryCompiler): """ Query compiler for the pandas storage format. - This class translates common query compiler API to default all methods - to pandas. + This class translates common query compiler API into + plain pandas to execute operations on small data + depending on the threshold. Parameters ---------- pandas_frame : pandas.DataFrame - Modin Frame to query with the compiled queries. + Pandas frame to query with the compiled queries. """ def __init__(self, pandas_frame): - assert InitializeWithSmallQueryCompilers.get() + assert UsePlainPandasQueryCompiler.get() if hasattr(pandas_frame, "_to_pandas"): pandas_frame = pandas_frame._to_pandas() if is_scalar(pandas_frame): @@ -627,99 +625,23 @@ def __init__(self, pandas_frame): elif not isinstance(pandas_frame, pandas.DataFrame): pandas_frame = pandas.DataFrame(pandas_frame) - self._pandas_frame = pandas_frame - - # def default_to_pandas(self, pandas_op, *args, **kwargs): - # args = (a.to_pandas() if isinstance(a, type(self)) else a for a in args) - # kwargs = { - # k: v.to_pandas if isinstance(v, type(self)) else v - # for k, v in kwargs.items() - # } - # op_name = getattr(pandas_op, "__name__", str(pandas_op)) - # ErrorMessage.default_to_pandas(op_name) - - # result = pandas_op(self._pandas_frame, *args, **kwargs) - # if isinstance(result, pandas.Series): - # if result.name is None: - # result.name = MODIN_UNNAMED_SERIES_LABEL - # result = result.to_frame() - - # return result - - def default_to_pandas(self, pandas_op, *args, **kwargs): - """ - Do fallback to pandas for the passed function. - - Parameters - ---------- - pandas_op : callable(pandas.DataFrame) -> object - Function to apply to the casted to pandas frame. - *args : iterable - Positional arguments to pass to `pandas_op`. - **kwargs : dict - Key-value arguments to pass to `pandas_op`. - - Returns - ------- - BaseQueryCompiler - The result of the `pandas_op`, converted back to ``BaseQueryCompiler``. - """ - op_name = getattr(pandas_op, "__name__", str(pandas_op)) - ErrorMessage.default_to_pandas(op_name) - args = try_cast_to_pandas(args) - kwargs = try_cast_to_pandas(kwargs) - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - result = pandas_op(try_cast_to_pandas(self), *args, **kwargs) - if isinstance(result, (tuple, list)): - if "Series.tolist" in pandas_op.__name__: - # fast path: no need to iterate over the result from `tolist` function - return result - return [self.__wrap_in_qc(obj) for obj in result] - # breakpoint() - return type(self)(result) - - def __wrap_in_qc(self, obj): - """ - Wrap `obj` in query compiler. - - Parameters - ---------- - obj : any - Object to wrap. - - Returns - ------- - BaseQueryCompiler - Query compiler wrapping the object. - """ - if isinstance(obj, pandas.Series): - if obj.name is None: - obj.name = MODIN_UNNAMED_SERIES_LABEL - obj = obj.to_frame() - if isinstance(obj, pandas.DataFrame): - return self.from_pandas(obj, type(self._pandas_frame)) - else: - return obj + self._modin_frame = pandas_frame def execute(self): - """Wait for all computations to complete without materializing data.""" pass def take_2d_positional(self, index=None, columns=None): index = slice(None) if index is None else index columns = slice(None) if columns is None else columns - self._pandas_frame.iloc[index, columns] - return self.__constructor__(self._pandas_frame.iloc[index, columns]) + return self.__constructor__(self._modin_frame.iloc[index, columns]) def copy(self): - return self.__constructor__(self._pandas_frame.copy()) + return self.__constructor__(self._modin_frame.copy()) def setitem_bool(self, row_loc, col_loc, item): - self._pandas_frame.loc[row_loc._pandas_frame.squeeze(axis=1), col_loc] = item - return self.__constructor__(self._pandas_frame) + self._modin_frame.loc[row_loc._modin_frame.squeeze(axis=1), col_loc] = item + return self.__constructor__(self._modin_frame) __and__ = _register_default_pandas(pandas.DataFrame.__and__, squeeze_series=True) __dir__ = _register_default_pandas(pandas.DataFrame.__dir__) @@ -943,7 +865,7 @@ def setitem_bool(self, row_loc, col_loc, item): mask = _register_default_pandas(pandas.DataFrame.mask) max = _register_default_pandas(pandas.DataFrame.max) map = _register_default_pandas(pandas.DataFrame.map) - mean = _register_default_pandas(pandas.DataFrame.mean) + mean = _register_default_pandas(pandas.DataFrame.mean, return_modin=False) median = _register_default_pandas(pandas.DataFrame.median) melt = _register_default_pandas(pandas.DataFrame.melt) memory_usage = _register_default_pandas(pandas.DataFrame.memory_usage) @@ -1140,9 +1062,9 @@ def dot(self, other, squeeze_self=None, squeeze_other=None): if squeeze_other: other = other.squeeze() if squeeze_self: - result = self._pandas_frame.squeeze(axis=1).dot(other) + result = self._modin_frame.squeeze(axis=1).dot(other) else: - result = self._pandas_frame.dot(other) + result = self._modin_frame.dot(other) if isinstance(result, pandas.Series): if result.name is None: result.name = "__reduced__" @@ -1227,7 +1149,7 @@ def expanding_corr( ) def get_axis(self, axis): - return self._pandas_frame.index if axis == 0 else self._pandas_frame.columns + return self._modin_frame.index if axis == 0 else self._modin_frame.columns def get_index_name(self, axis=0): return self.get_axis(axis).name @@ -1240,9 +1162,9 @@ def set_index_name(self, name, axis=0): def has_multiindex(self, axis=0): if axis == 0: - return isinstance(self._pandas_frame.index, pandas.MultiIndex) + return isinstance(self._modin_frame.index, pandas.MultiIndex) assert axis == 1 - return isinstance(self._pandas_frame.columns, pandas.MultiIndex) + return isinstance(self._modin_frame.columns, pandas.MultiIndex) def isin(self, values, ignore_indices=False, **kwargs): if isinstance(values, type(self)) and ignore_indices: @@ -1258,7 +1180,7 @@ def isin(self, values, ignore_indices=False, **kwargs): ) def to_pandas(self): - return self._pandas_frame + return self._modin_frame @classmethod def from_pandas(cls, df, data_cls): @@ -1277,7 +1199,7 @@ def finalize(self): # Dataframe exchange protocol def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): - return self._pandas_frame.__dataframe__( + return self._modin_frame.__dataframe__( nan_as_null=nan_as_null, allow_copy=allow_copy ) @@ -1292,14 +1214,12 @@ def from_dataframe(cls, df, data_cls): @property def dtypes(self): - return self._pandas_frame.dtypes + return self._modin_frame.dtypes def getitem_column_array(self, key, numeric=False, ignore_order=False): if numeric: - return self.__constructor__(self._pandas_frame.iloc[:, key]) - return self.__constructor__(self._pandas_frame.loc[:, key]) + return self.__constructor__(self._modin_frame.iloc[:, key]) + return self.__constructor__(self._modin_frame.loc[:, key]) def is_series_like(self): - return ( - len(self._pandas_frame.columns) == 1 or len(self._pandas_frame.index) == 1 - ) + return len(self._modin_frame.columns) == 1 or len(self._modin_frame.index) == 1 diff --git a/modin/pandas/base.py b/modin/pandas/base.py index b85d6438b67..84cde4a32bd 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -67,7 +67,7 @@ from modin import pandas as pd from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import ClassLogger, disable_logging from modin.pandas.accessor import CachedAccessor, ModinAPI @@ -286,7 +286,7 @@ def _build_repr_df( indexer = row_indexer, _get_repr_axis_label_indexer(self.columns, num_cols) else: indexer = row_indexer - if isinstance(self._query_compiler, SmallQueryCompiler): + if isinstance(self._query_compiler, PlainPandasQueryCompiler): return self._query_compiler.to_pandas().iloc[indexer] return self.iloc[indexer]._query_compiler.to_pandas() diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 64e77404922..8fbfa74f8a4 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -57,10 +57,10 @@ from pandas.io.formats.info import DataFrameInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import InitializeWithSmallQueryCompilers, PersistentPickle +from modin.config import PersistentPickle, UsePlainPandasQueryCompiler from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import disable_logging from modin.pandas import Categorical @@ -261,11 +261,11 @@ def __init__( else: self._query_compiler = query_compiler - if query_compiler is None and InitializeWithSmallQueryCompilers.get(): + if query_compiler is None and UsePlainPandasQueryCompiler.get(): small_dataframe = pandas.DataFrame( data=data, index=index, columns=columns, dtype=dtype, copy=copy ) - self._query_compiler = SmallQueryCompiler(small_dataframe) + self._query_compiler = PlainPandasQueryCompiler(small_dataframe) def __repr__(self) -> str: """ diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 89d6d8f80e8..24ce1da3359 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -65,10 +65,10 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.readers import _c_parser_defaults -from modin.config import ExperimentalNumPyAPI, InitializeWithSmallQueryCompilers +from modin.config import ExperimentalNumPyAPI, UsePlainPandasQueryCompiler from modin.error_message import ErrorMessage from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import ClassLogger, enable_logging from modin.utils import ( @@ -995,8 +995,8 @@ def from_pandas(df) -> DataFrame: """ from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher - if InitializeWithSmallQueryCompilers.get(): - return ModinObjects.DataFrame(query_compiler=SmallQueryCompiler(df)) + if UsePlainPandasQueryCompiler.get(): + return ModinObjects.DataFrame(query_compiler=PlainPandasQueryCompiler(df)) return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index ede267bfedb..749cf0f6a50 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -31,9 +31,9 @@ from pandas.io.formats.info import SeriesInfo from pandas.util._validators import validate_bool_kwarg -from modin.config import InitializeWithSmallQueryCompilers, PersistentPickle +from modin.config import PersistentPickle, UsePlainPandasQueryCompiler from modin.experimental.core.storage_formats.pandas.small_query_compiler import ( - SmallQueryCompiler, + PlainPandasQueryCompiler, ) from modin.logging import disable_logging from modin.pandas.io import from_pandas, to_pandas @@ -147,8 +147,8 @@ def __init__( name = MODIN_UNNAMED_SERIES_LABEL if isinstance(data, pandas.Series) and data.name is not None: name = data.name - if InitializeWithSmallQueryCompilers.get(): - query_compiler = SmallQueryCompiler( + if UsePlainPandasQueryCompiler.get(): + query_compiler = PlainPandasQueryCompiler( pandas.DataFrame( pandas.Series( data=data, diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index d4c6995ce00..4f28316b327 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -17,12 +17,7 @@ import pytest import modin.pandas as pd -from modin.config import ( - Engine, - InitializeWithSmallQueryCompilers, - NPartitions, - StorageFormat, -) +from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler from modin.core.dataframe.pandas.partitioning.axis_partition import ( PandasDataframeAxisPartition, ) @@ -223,8 +218,8 @@ def operation(df): reason="Modin on this engine doesn't create virtual partitions.", ) @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize( "left_virtual,right_virtual", [(True, False), (False, True), (True, True)] diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py index 4c6a9dfde0c..22a0382ea0a 100644 --- a/modin/tests/pandas/dataframe/test_default.py +++ b/modin/tests/pandas/dataframe/test_default.py @@ -23,12 +23,7 @@ from numpy.testing import assert_array_equal import modin.pandas as pd -from modin.config import ( - Engine, - InitializeWithSmallQueryCompilers, - NPartitions, - StorageFormat, -) +from modin.config import Engine, NPartitions, StorageFormat, UsePlainPandasQueryCompiler from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( axis_keys, @@ -95,7 +90,7 @@ def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): operation = getattr(modin_df, op) @@ -113,7 +108,7 @@ def test_style(): data = test_data_values[0] with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): pd.DataFrame(data).style @@ -125,7 +120,7 @@ def test_to_timestamp(): with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): df.to_period().to_timestamp() @@ -142,8 +137,8 @@ def test_to_numpy(data): @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): @@ -158,7 +153,7 @@ def test_asfreq(): df = pd.DataFrame({"s": series}) with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): # We are only testing that this defaults to pandas, so we will just check for @@ -320,7 +315,7 @@ def test_corr_min_periods(self, min_periods): {"a": [1, np.nan, 3, 4, 5, 6], "b": [1, 2, 1, 4, 5, np.nan]} ) modin_df = pd.concat([modin_df.iloc[:3], modin_df.iloc[3:]]) - if not InitializeWithSmallQueryCompilers.get(): + if not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( modin_df, pandas_df, lambda df: df.corr(min_periods=min_periods) @@ -340,8 +335,8 @@ def test_corr_non_numeric(self, numeric_only): reason="doesn't make sense for non-partitioned executions", ) @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not contain partitions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not contain partitions.", ) def test_corr_nans_in_different_partitions(self): # NaN in the first partition @@ -655,7 +650,10 @@ def test_pivot(data, index, columns, values, request): or "default-one_column-several_columns_index" in request.node.callspec.id or "default-one_column-one_column_index" in request.node.callspec.id or ( - current_execution in ("BaseOnPython", "HdkOnNative") + ( + current_execution in ("BaseOnPython", "HdkOnNative") + or UsePlainPandasQueryCompiler.get() + ) and index is lib.no_default ) ): @@ -1035,7 +1033,8 @@ def test_resampler_functions_with_arg(rule, axis, method_arg): "DateColumn", marks=pytest.mark.xfail( condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python") - and StorageFormat.get() != "Base", + and StorageFormat.get() != "Base" + and not UsePlainPandasQueryCompiler.get(), reason="https://github.com/modin-project/modin/issues/6399", ), ), diff --git a/modin/tests/pandas/dataframe/test_indexing.py b/modin/tests/pandas/dataframe/test_indexing.py index 6b778aedb6d..54bee0cdd53 100644 --- a/modin/tests/pandas/dataframe/test_indexing.py +++ b/modin/tests/pandas/dataframe/test_indexing.py @@ -22,10 +22,10 @@ import modin.pandas as pd from modin.config import ( - InitializeWithSmallQueryCompilers, MinPartitionSize, NPartitions, StorageFormat, + UsePlainPandasQueryCompiler, ) from modin.pandas.indexing import is_range_like from modin.pandas.testing import assert_index_equal @@ -589,8 +589,8 @@ def test_loc_setting_single_categorical_column(): @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not currently support IO functions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not currently support IO functions.", ) def test_loc_multi_index(): modin_df = pd.read_csv( @@ -1478,7 +1478,7 @@ def test_reindex_multiindex(): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_reset_index(data, test_async_reset_index): modin_df, pandas_df = create_test_dfs(data) - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): modin_df._query_compiler._modin_frame.set_index_cache(None) modin_result = modin_df.reset_index(inplace=False) pandas_result = pandas_df.reset_index(inplace=False) @@ -1486,7 +1486,7 @@ def test_reset_index(data, test_async_reset_index): modin_df_cp = modin_df.copy() pd_df_cp = pandas_df.copy() - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): modin_df._query_compiler._modin_frame.set_index_cache(None) modin_df_cp.reset_index(inplace=True) pd_df_cp.reset_index(inplace=True) @@ -1673,7 +1673,7 @@ def test_reset_index_with_multi_index_no_drop( kwargs["col_level"] = col_level if col_fill != "no_col_fill": kwargs["col_fill"] = col_fill - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): modin_df._query_compiler._modin_frame.set_index_cache(None) eval_general( modin_df, @@ -1792,7 +1792,7 @@ def test_reset_index_with_named_index( ) modin_df.index.name = pandas_df.index.name = index_name df_equals(modin_df, pandas_df) - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): # The change in index is not automatically handled by Modin. See #3941. modin_df.index = modin_df.index modin_df.modin.to_pandas() @@ -1800,7 +1800,7 @@ def test_reset_index_with_named_index( modin_df._query_compiler._modin_frame.set_index_cache(None) df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False)) - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): # The change in index is not automatically handled by Modin. See #3941. modin_df.index = modin_df.index modin_df.modin.to_pandas() @@ -1813,7 +1813,7 @@ def test_reset_index_with_named_index( modin_df = pd.DataFrame(test_data_values[0]) pandas_df = pandas.DataFrame(test_data_values[0]) modin_df.index.name = pandas_df.index.name = index_name - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): # The change in index is not automatically handled by Modin. See #3941. modin_df.index = modin_df.index modin_df._to_pandas() @@ -1836,7 +1836,7 @@ def test_reset_index_with_named_index( def test_reset_index_metadata_update(index, test_async_reset_index): modin_df, pandas_df = create_test_dfs({"col0": [0, 1, 2, 3]}, index=index) modin_df.columns = pandas_df.columns = ["col1"] - if test_async_reset_index and not InitializeWithSmallQueryCompilers.get(): + if test_async_reset_index and not UsePlainPandasQueryCompiler.get(): # The change in index is not automatically handled by Modin. See #3941. modin_df.index = modin_df.index modin_df._to_pandas() @@ -2255,8 +2255,8 @@ def test___setitem__partitions_aligning(): @pytest.mark.skipif( - InitializeWithSmallQueryCompilers.get(), - reason="SmallQueryCompiler does not currently support IO functions.", + UsePlainPandasQueryCompiler.get(), + reason="PlainPandasQueryCompiler does not currently support IO functions.", ) def test___setitem__with_mismatched_partitions(): with ensure_clean(".csv") as fname: diff --git a/modin/tests/pandas/dataframe/test_join_sort.py b/modin/tests/pandas/dataframe/test_join_sort.py index 2a33e0b860d..723aeedd786 100644 --- a/modin/tests/pandas/dataframe/test_join_sort.py +++ b/modin/tests/pandas/dataframe/test_join_sort.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +import contextlib import warnings import matplotlib @@ -19,7 +20,13 @@ import pytest import modin.pandas as pd -from modin.config import Engine, NPartitions, RangePartitioning, StorageFormat +from modin.config import ( + Engine, + NPartitions, + RangePartitioning, + StorageFormat, + UsePlainPandasQueryCompiler, +) from modin.pandas.io import to_pandas from modin.tests.pandas.utils import ( arg_keys, @@ -489,17 +496,20 @@ def setup_cache(): if has_index_cache: modin_df1.index # triggering index materialization modin_df2.index - assert modin_df1._query_compiler._modin_frame.has_index_cache - assert modin_df2._query_compiler._modin_frame.has_index_cache + if not UsePlainPandasQueryCompiler.get(): + assert modin_df1._query_compiler._modin_frame.has_index_cache + assert modin_df2._query_compiler._modin_frame.has_index_cache else: # Propagate deferred indices to partitions # The change in index is not automatically handled by Modin. See #3941. modin_df1.index = modin_df1.index modin_df1._to_pandas() - modin_df1._query_compiler._modin_frame.set_index_cache(None) + if not UsePlainPandasQueryCompiler.get(): + modin_df1._query_compiler._modin_frame.set_index_cache(None) modin_df2.index = modin_df2.index modin_df2._to_pandas() - modin_df2._query_compiler._modin_frame.set_index_cache(None) + if not UsePlainPandasQueryCompiler.get(): + modin_df2._query_compiler._modin_frame.set_index_cache(None) for on in ( ["col_key1", "idx_key1"], @@ -617,7 +627,11 @@ def test_sort_multiindex(sort_remaining): setattr(df, index, new_index) for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: - with warns_that_defaulting_to_pandas(): + with ( + warns_that_defaulting_to_pandas() + if not UsePlainPandasQueryCompiler.get() + else contextlib.nullcontext() + ): df_equals( modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), @@ -761,7 +775,7 @@ def test_sort_values_descending_with_only_two_bins(): modin_df = pd.concat([part1, part2]) pandas_df = modin_df._to_pandas() - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (2, 1) eval_general( @@ -805,7 +819,7 @@ def test_sort_values_with_one_partition(ascending): np.array([["hello", "goodbye"], ["hello", "Hello"]]) ) - if StorageFormat.get() == "Pandas": + if StorageFormat.get() == "Pandas" and not UsePlainPandasQueryCompiler.get(): assert modin_df._query_compiler._modin_frame._partitions.shape == (1, 1) eval_general( @@ -925,7 +939,7 @@ def test_sort_values_with_only_one_non_na_row_in_partition(ascending, na_positio @pytest.mark.skipif( - Engine.get() not in ("Ray", "Unidist", "Dask"), + Engine.get() not in ("Ray", "Unidist", "Dask") or UsePlainPandasQueryCompiler.get(), reason="We only need to test this case where sort does not default to pandas.", ) def test_sort_values_with_sort_key_on_partition_boundary(): diff --git a/modin/tests/pandas/test_expanding.py b/modin/tests/pandas/test_expanding.py index 85e08595a89..5a962061e47 100644 --- a/modin/tests/pandas/test_expanding.py +++ b/modin/tests/pandas/test_expanding.py @@ -18,7 +18,7 @@ import pytest import modin.pandas as pd -from modin.config import InitializeWithSmallQueryCompilers, NPartitions +from modin.config import NPartitions, UsePlainPandasQueryCompiler from modin.tests.test_utils import warns_that_defaulting_to_pandas from .utils import ( @@ -71,7 +71,7 @@ def test_dataframe(data, min_periods, axis, method, kwargs): def test_dataframe_corr_cov(data, min_periods, axis, method): with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): eval_general( @@ -87,7 +87,7 @@ def test_dataframe_corr_cov_with_self(method): mdf, pdf = create_test_dfs(test_data["float_nan_data"]) with ( warns_that_defaulting_to_pandas() - if not InitializeWithSmallQueryCompilers.get() + if not UsePlainPandasQueryCompiler.get() else contextlib.nullcontext() ): eval_general( diff --git a/setup.cfg b/setup.cfg index bdf29321673..ab9bf80c882 100644 --- a/setup.cfg +++ b/setup.cfg @@ -75,4 +75,4 @@ exclude_lines = pass [pytest] -addopts = --env=MODIN_SMALL_QUERY_COMPILER=True \ No newline at end of file +addopts = --env=MODIN_USE_PLAIN_PANDAS_QUERY_COMPILER=True \ No newline at end of file