From 0254080f42fa260c5439b7595a2d8f4cfd70cafb Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Mon, 12 Dec 2022 09:38:59 -0600 Subject: [PATCH] FIX-#5308: Allow custom execution with no known engine. (#5379) Signed-off-by: mvashishtha --- .github/workflows/ci.yml | 36 ++++++++++ modin/config/envvars.py | 7 +- .../dispatching/factories/dispatcher.py | 66 +++++++++---------- requirements/requirements-no-engine.yml | 48 ++++++++++++++ 4 files changed, 122 insertions(+), 35 deletions(-) create mode 100644 requirements/requirements-no-engine.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb68f171b90..ad3383502b1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -330,6 +330,42 @@ jobs: - run: python -m pytest modin/test/test_logging.py - uses: codecov/codecov-action@v2 + test-no-engine: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + - name: Cache conda + uses: actions/cache@v3 + with: + path: | + ~/conda_pkgs_dir + ~/.cache/pip + key: + ${{ runner.os }}-conda-${{ hashFiles('requirements-no-engine.yml') }} + - uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + activate-environment: modin + environment-file: requirements/requirements-no-engine.yml + python-version: 3.8 + channel-priority: strict + # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed + # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 + use-only-tar-bz2: false + - name: Conda environment + run: | + conda info + conda list + - run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py::test_add_option + - uses: codecov/codecov-action@v2 + test-defaults: needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] runs-on: ubuntu-latest diff --git a/modin/config/envvars.py b/modin/config/envvars.py index d24ae675aaf..d16a463d34f 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -83,6 +83,8 @@ class Engine(EnvironmentVariable, type=str): "Python", } # engines that don't require initialization, useful for unit tests + has_custom_engine = False + @classmethod def _get_default(cls) -> str: """ @@ -98,7 +100,9 @@ def _get_default(cls) -> str: MIN_UNIDIST_VERSION, ) - if IsDebug.get(): + # If there's a custom engine, we don't need to check for any engine + # dependencies. Return the default "Python" engine. + if IsDebug.get() or cls.has_custom_engine: return "Python" try: import ray @@ -160,6 +164,7 @@ def _get_default(cls) -> str: def add_option(cls, choice: Any) -> Any: choice = super().add_option(choice) cls.NOINIT_ENGINES.add(choice) + cls.has_custom_engine = True return choice diff --git a/modin/core/execution/dispatching/factories/dispatcher.py b/modin/core/execution/dispatching/factories/dispatcher.py index ba77bf9e117..0e84efb4a76 100644 --- a/modin/core/execution/dispatching/factories/dispatcher.py +++ b/modin/core/execution/dispatching/factories/dispatcher.py @@ -109,7 +109,9 @@ class FactoryDispatcher(object): @classmethod def get_factory(cls) -> factories.BaseFactory: """Get current factory.""" - # mostly for testing + if cls.__factory is None: + Engine.subscribe(cls._update_factory) + StorageFormat.subscribe(cls._update_factory) return cls.__factory @classmethod @@ -155,152 +157,148 @@ def _update_factory(cls, _): @classmethod @_inherit_docstrings(factories.BaseFactory._from_pandas) def from_pandas(cls, df): - return cls.__factory._from_pandas(df) + return cls.get_factory()._from_pandas(df) @classmethod @_inherit_docstrings(factories.BaseFactory._from_arrow) def from_arrow(cls, at): - return cls.__factory._from_arrow(at) + return cls.get_factory()._from_arrow(at) @classmethod @_inherit_docstrings(factories.BaseFactory._from_non_pandas) def from_non_pandas(cls, *args, **kwargs): - return cls.__factory._from_non_pandas(*args, **kwargs) + return cls.get_factory()._from_non_pandas(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._from_dataframe) def from_dataframe(cls, *args, **kwargs): - return cls.__factory._from_dataframe(*args, **kwargs) + return cls.get_factory()._from_dataframe(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_parquet) def read_parquet(cls, **kwargs): - return cls.__factory._read_parquet(**kwargs) + return cls.get_factory()._read_parquet(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_csv) def read_csv(cls, **kwargs): - return cls.__factory._read_csv(**kwargs) + return cls.get_factory()._read_csv(**kwargs) @classmethod @_inherit_docstrings(factories.ExperimentalPandasOnRayFactory._read_csv_glob) def read_csv_glob(cls, **kwargs): - return cls.__factory._read_csv_glob(**kwargs) + return cls.get_factory()._read_csv_glob(**kwargs) @classmethod @_inherit_docstrings( factories.ExperimentalPandasOnRayFactory._read_pickle_distributed ) def read_pickle_distributed(cls, **kwargs): - return cls.__factory._read_pickle_distributed(**kwargs) + return cls.get_factory()._read_pickle_distributed(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_json) def read_json(cls, **kwargs): - return cls.__factory._read_json(**kwargs) + return cls.get_factory()._read_json(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_gbq) def read_gbq(cls, **kwargs): - return cls.__factory._read_gbq(**kwargs) + return cls.get_factory()._read_gbq(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_html) def read_html(cls, **kwargs): - return cls.__factory._read_html(**kwargs) + return cls.get_factory()._read_html(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_clipboard) def read_clipboard(cls, **kwargs): - return cls.__factory._read_clipboard(**kwargs) + return cls.get_factory()._read_clipboard(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_excel) def read_excel(cls, **kwargs): - return cls.__factory._read_excel(**kwargs) + return cls.get_factory()._read_excel(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_hdf) def read_hdf(cls, **kwargs): - return cls.__factory._read_hdf(**kwargs) + return cls.get_factory()._read_hdf(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_feather) def read_feather(cls, **kwargs): - return cls.__factory._read_feather(**kwargs) + return cls.get_factory()._read_feather(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_stata) def read_stata(cls, **kwargs): - return cls.__factory._read_stata(**kwargs) + return cls.get_factory()._read_stata(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_sas) def read_sas(cls, **kwargs): # pragma: no cover - return cls.__factory._read_sas(**kwargs) + return cls.get_factory()._read_sas(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_pickle) def read_pickle(cls, **kwargs): - return cls.__factory._read_pickle(**kwargs) + return cls.get_factory()._read_pickle(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_sql) def read_sql(cls, **kwargs): - return cls.__factory._read_sql(**kwargs) + return cls.get_factory()._read_sql(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_fwf) def read_fwf(cls, **kwargs): - return cls.__factory._read_fwf(**kwargs) + return cls.get_factory()._read_fwf(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_sql_table) def read_sql_table(cls, **kwargs): - return cls.__factory._read_sql_table(**kwargs) + return cls.get_factory()._read_sql_table(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_sql_query) def read_sql_query(cls, **kwargs): - return cls.__factory._read_sql_query(**kwargs) + return cls.get_factory()._read_sql_query(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._read_spss) def read_spss(cls, **kwargs): - return cls.__factory._read_spss(**kwargs) + return cls.get_factory()._read_spss(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_sql) def to_sql(cls, *args, **kwargs): - return cls.__factory._to_sql(*args, **kwargs) + return cls.get_factory()._to_sql(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_pickle) def to_pickle(cls, *args, **kwargs): - return cls.__factory._to_pickle(*args, **kwargs) + return cls.get_factory()._to_pickle(*args, **kwargs) @classmethod @_inherit_docstrings( factories.ExperimentalPandasOnRayFactory._to_pickle_distributed ) def to_pickle_distributed(cls, *args, **kwargs): - return cls.__factory._to_pickle_distributed(*args, **kwargs) + return cls.get_factory()._to_pickle_distributed(*args, **kwargs) @classmethod @_inherit_docstrings(factories.ExperimentalPandasOnRayFactory._read_custom_text) def read_custom_text(cls, **kwargs): - return cls.__factory._read_custom_text(**kwargs) + return cls.get_factory()._read_custom_text(**kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_csv) def to_csv(cls, *args, **kwargs): - return cls.__factory._to_csv(*args, **kwargs) + return cls.get_factory()._to_csv(*args, **kwargs) @classmethod @_inherit_docstrings(factories.BaseFactory._to_parquet) def to_parquet(cls, *args, **kwargs): - return cls.__factory._to_parquet(*args, **kwargs) - - -Engine.subscribe(FactoryDispatcher._update_factory) -StorageFormat.subscribe(FactoryDispatcher._update_factory) + return cls.get_factory()._to_parquet(*args, **kwargs) diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml new file mode 100644 index 00000000000..45f53ac39a9 --- /dev/null +++ b/requirements/requirements-no-engine.yml @@ -0,0 +1,48 @@ +channels: + - conda-forge +dependencies: + - pandas==1.5.2 + - numpy>=1.18.5 + - pyarrow>=4.0.1 + - fsspec + - xarray + - Jinja2 + - scipy + - pip + - s3fs>=2021.8 + - feather-format + - lxml + - openpyxl + - xlrd + - matplotlib + - sqlalchemy>=1.4.0 + - pandas-gbq + - pytables + - msgpack-python + - psutil + - pytest>=6.0.1 + - pytest-benchmark + - pytest-cov>=2.10.1 + - pytest-xdist>=2.1.0 + - coverage + - pygithub + - rpyc==4.1.5 + - cloudpickle + - boto3 + # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost + # when we use collective instead of rabit. + - xgboost>=1.7.1,<2.0.0 + - tqdm + - pip: + # Fixes breaking ipywidgets changes, but didn't release yet. + - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 + - git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9 + # no conda package for windows + - connectorx>=0.2.6a4 + - black + # TODO: remove when flake8 5.x stabilizes and appears in both pip and conda-forge; see GH-#4745 + - flake8<5 + - flake8-no-implicit-concat + - flake8-print + # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. + - numpydoc==1.1.0