From d54f92790a882c24ab2041e3089284738641a3c6 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Mon, 17 Jun 2024 13:12:58 +0200
Subject: [PATCH 01/10] TEST-#7316: Run a subset of CI tests with python 3.10 and 3.11 on a scheduled basis (#7289)

Signed-off-by: Anatoly Myachev
---
 .github/actions/python-only/action.yml |  2 +-
 .github/workflows/ci-required.yml      |  2 +-
 .github/workflows/ci.yml               | 63 ++++++++++++++++++++------
 modin/tests/pandas/test_series.py      |  7 ++-
 4 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/.github/actions/python-only/action.yml b/.github/actions/python-only/action.yml
index 128519ba32b..3664b4fb751 100644
--- a/.github/actions/python-only/action.yml
+++ b/.github/actions/python-only/action.yml
@@ -3,7 +3,7 @@ description: "Prepare the environment to run simple tasks"
 inputs:
   python-version:
     description: "Python version to install"
-    default: "3.9.x"
+    default: "3.9"
 
 runs:
   using: "composite"
diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml
index f575945cd85..d7a8e7d07ee 100644
--- a/.github/workflows/ci-required.yml
+++ b/.github/workflows/ci-required.yml
@@ -28,7 +28,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v5
         with:
-          python-version: "3.9.x"
+          python-version: "3.9"
           architecture: "x64"
           cache: "pip"
           cache-dependency-path: '**/requirements-doc.txt'
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 68ea8eaac3e..9ec1ca9d22d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,6 +17,9 @@ on:
       - setup.py
       - versioneer.py
   push:
+  schedule:
+    - cron: "30 2 * * WED"
+    - cron: "30 2 * * THU"
 concurrency:
   # Cancel other jobs in the same branch. We don't care whether CI passes
   # on old commits.
@@ -26,21 +29,44 @@ env:
   MODIN_GITHUB_CI: true
 
 jobs:
+  python-filter:
+    runs-on: ubuntu-latest
+    outputs:
+      python-version: ${{ steps.choose.outputs.python-version }}
+    steps:
+      - id: choose
+        run: |
+          if [[ "${{ github.event.schedule }}" = "30 2 * * WED" ]]
+          then
+            echo "python-version=3.10" >> "$GITHUB_OUTPUT"
+          elif [[ "${{ github.event.schedule }}" = "30 2 * * THU" ]]
+          then
+            echo "python-version=3.11" >> "$GITHUB_OUTPUT"
+          else
+            echo "python-version=3.9" >> "$GITHUB_OUTPUT"
+          fi
+
   lint-mypy:
+    needs: [python-filter]
     name: lint (mypy)
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - run: pip install -r requirements-dev.txt
       - run: mypy --config-file mypy.ini
 
   lint-flake8:
+    needs: [python-filter]
     name: lint (flake8)
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       # NOTE: If you are changing the set of packages installed here, make sure that
       # the dev requirements match them.
       - run: pip install flake8 flake8-print flake8-no-implicit-concat
@@ -49,6 +75,7 @@ jobs:
       - run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py
 
   test-api-and-no-engine:
+    needs: [python-filter]
     name: Test API, headers and no-engine mode
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -59,6 +86,7 @@ jobs:
       - uses: ./.github/actions/mamba-env
        with:
          environment-file: requirements/requirements-no-engine.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
      - run: python -m pytest modin/tests/pandas/test_api.py
      - run: python -m pytest modin/tests/test_executions_api.py
      - run: python -m pytest modin/tests/test_headers.py
@@ -66,7 +94,7 @@ jobs:
      - uses: ./.github/actions/upload-coverage
 
   test-clean-install:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     strategy:
       matrix:
         os:
           - ubuntu
           - windows
     runs-on: ${{ matrix.os }}-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -80,6 +108,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - run: python -m pip install -e ".[all]"
       - name: Ensure Ray and Dask engines start up
         run: |
@@ -94,7 +124,7 @@ jobs:
         if: matrix.os == 'ubuntu'
 
   test-internals:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -105,6 +135,7 @@ jobs:
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: environment-dev.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Internals tests
         run: python -m pytest modin/tests/core/test_dispatcher.py
       - run: python -m pytest modin/tests/config
@@ -120,7 +151,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-defaults:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       matrix:
        execution: [BaseOnPython]
    env:
      MODIN_TEST_DATASET_SIZE: "small"
-    name: Test ${{ matrix.execution }} execution, Python 3.9
+    name: Test ${{ matrix.execution }} execution, Python ${{ needs.python-filter.outputs.python-version }}
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: environment-dev.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Install HDF5
         run: sudo apt update && sudo apt install -y libhdf5-dev
       - name: xgboost tests
@@ -240,7 +272,7 @@ jobs:
           "${{ steps.filter.outputs.ray }}"
           "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT
 
   test-all-unidist:
-    needs: [lint-flake8, execution-filter]
+    needs: [lint-flake8, execution-filter, python-filter]
     if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true'
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         unidist-backend: ["mpi"]
     env:
       MODIN_ENGINE: "Unidist"
@@ -314,13 +346,13 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-all:
-    needs: [lint-flake8, execution-filter]
+    needs: [lint-flake8, execution-filter, python-filter]
     strategy:
       matrix:
         os:
           - ubuntu
           - windows
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         engine: ${{ fromJSON( github.event_name == 'push' && '["python", "ray", "dask"]' || needs.execution-filter.outputs.engines ) }}
         test_task:
           - group_1
@@ -446,14 +478,14 @@ jobs:
         if: matrix.os == 'windows'
 
   test-sanity:
-    needs: [lint-flake8, execution-filter]
+    needs: [lint-flake8, execution-filter, python-filter]
     if: github.event_name == 'pull_request'
     strategy:
       matrix:
         os:
           - ubuntu
           - windows
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         execution:
           - name: ray
             shell-ex: "python -m pytest"
@@ -579,7 +611,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-experimental:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -601,6 +633,7 @@ jobs:
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: environment-dev.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Install HDF5
         run: sudo apt update && sudo apt install -y libhdf5-dev
       - run: python -m pytest -n 2 modin/tests/pandas/dataframe/test_map_metadata.py
@@ -610,14 +643,14 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-spreadsheet:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         engine: ["ray", "dask"]
     env:
       MODIN_EXPERIMENTAL: "True"
@@ -647,7 +680,7 @@ jobs:
           delete-merged: true
 
   upload-coverage:
-    needs: [merge-coverage-artifacts]
+    needs: [merge-coverage-artifacts, python-filter]
     if: always()  # we need to run it regardless of some job being skipped, like in PR
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -656,6 +689,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Download coverage data
         uses: actions/download-artifact@v4
         with:
diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index 8e125abf64d..434496baac5 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -16,6 +16,7 @@
 import datetime
 import itertools
 import json
+import sys
 import unittest.mock as mock
 
 import matplotlib
@@ -4879,11 +4880,15 @@ def set_categories(ser):
 
     # pandas 2.0.0: Removed setting Categorical.categories directly (GH47834)
     # Just check the exception
+    expected_exception = AttributeError("can't set attribute")
+    if sys.version_info >= (3, 10):
+        # The exception message varies across different versions of Python
+        expected_exception = False
     eval_general(
         modin_series,
         pandas_series,
         set_categories,
-        expected_exception=AttributeError("can't set attribute"),
+        expected_exception=expected_exception,
     )
From 35298c057fd6ad9822d309441ae56684696ffad1 Mon Sep 17 00:00:00 2001
From: Kurt McKee
Date: Mon, 17 Jun 2024 18:31:06 -0500
Subject: [PATCH 02/10] Add a Dependabot config to auto-update GitHub action versions (#7318)

Signed-off-by: Kurt McKee
---
 .github/dependabot.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .github/dependabot.yaml

diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml
new file mode 100644
index 00000000000..2390d8c809e
--- /dev/null
+++ b/.github/dependabot.yaml
@@ -0,0 +1,10 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    groups:
+      github-actions:
+        patterns:
+          - "*"

From 08c1b115d0d9a3630c937237be1152d554c4e984 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 18 Jun 2024 16:18:15 +0200
Subject: [PATCH 03/10] FIX-#7321: Using 'C' engine instead of 'pyarrow' for getting metadata in 'read_csv' (#7322)

Signed-off-by: Anatoly Myachev
---
 modin/core/io/text/text_file_dispatcher.py | 34 ++++++++++++----------
 modin/tests/pandas/test_io.py              |  2 +-
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py
index 6f7e68393b9..130fd484ca1 100644
--- a/modin/core/io/text/text_file_dispatcher.py
+++ b/modin/core/io/text/text_file_dispatcher.py
@@ -216,7 +216,7 @@ def partitioned_file(
         newline: bytes = None,
         header_size: int = 0,
         pre_reading: int = 0,
-        read_callback_kw: dict = None,
+        get_metadata_kw: dict = None,
     ):
         """
         Compute chunk sizes in bytes for every partition.
@@ -244,7 +244,7 @@ def partitioned_file(
             Number of rows, that occupied by header.
         pre_reading : int, default: 0
             Number of rows between header and skipped rows, that should be read.
-        read_callback_kw : dict, optional
+        get_metadata_kw : dict, optional
             Keyword arguments for `cls.read_callback` to compute metadata if needed.
             This option is not compatible with `pre_reading!=0`.
 
@@ -255,11 +255,11 @@ def partitioned_file(
             int : partition start read byte
             int : partition end read byte
         pandas.DataFrame or None
-            Dataframe from which metadata can be retrieved. Can be None if `read_callback_kw=None`.
+            Dataframe from which metadata can be retrieved. Can be None if `get_metadata_kw=None`.
         """
-        if read_callback_kw is not None and pre_reading != 0:
+        if get_metadata_kw is not None and pre_reading != 0:
             raise ValueError(
-                f"Incompatible combination of parameters: {read_callback_kw=}, {pre_reading=}"
+                f"Incompatible combination of parameters: {get_metadata_kw=}, {pre_reading=}"
             )
         read_rows_counter = 0
         outside_quotes = True
@@ -297,11 +297,11 @@ def partitioned_file(
                 rows_skipper(skiprows)
         else:
             rows_skipper(skiprows)
-            if read_callback_kw:
+            if get_metadata_kw:
                 start = f.tell()
                 # For correct behavior, if we want to avoid double skipping rows,
                 # we need to get metadata after skipping.
-                pd_df_metadata = cls.read_callback(f, **read_callback_kw)
+                pd_df_metadata = cls.read_callback(f, **get_metadata_kw)
                 f.seek(start)
             rows_skipper(header_size)
 
@@ -1063,28 +1063,32 @@ def _read(cls, filepath_or_buffer, **kwargs):
             and (usecols is None or skiprows is None)
             and pre_reading == 0
         )
-        read_callback_kw = dict(kwargs, nrows=1, skipfooter=0, index_col=index_col)
+        get_metadata_kw = dict(kwargs, nrows=1, skipfooter=0, index_col=index_col)
+        if get_metadata_kw.get("engine", None) == "pyarrow":
+            # pyarrow engine doesn't support `nrows` option;
+            # https://github.com/pandas-dev/pandas/issues/38872 can be used to track pyarrow engine features
+            get_metadata_kw["engine"] = "c"
         if not can_compute_metadata_while_skipping_rows:
             pd_df_metadata = cls.read_callback(
                 filepath_or_buffer_md,
-                **read_callback_kw,
+                **get_metadata_kw,
             )
             column_names = pd_df_metadata.columns
             column_widths, num_splits = cls._define_metadata(
                 pd_df_metadata, column_names
             )
-            read_callback_kw = None
+            get_metadata_kw = None
         else:
-            read_callback_kw = dict(read_callback_kw, skiprows=None)
+            get_metadata_kw = dict(get_metadata_kw, skiprows=None)
             # `memory_map` doesn't work with file-like object so we can't use it here.
             # We can definitely skip it without violating the reading logic
             # since this parameter is intended to optimize reading.
             # For reading a couple of lines, this is not essential.
-            read_callback_kw.pop("memory_map", None)
+            get_metadata_kw.pop("memory_map", None)
             # These parameters are already used when opening file `f`,
             # they do not need to be used again.
-            read_callback_kw.pop("storage_options", None)
-            read_callback_kw.pop("compression", None)
+            get_metadata_kw.pop("storage_options", None)
+            get_metadata_kw.pop("compression", None)
 
         with OpenFile(
             filepath_or_buffer_md,
@@ -1110,7 +1114,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
                     newline=newline,
                     header_size=header_size,
                     pre_reading=pre_reading,
-                    read_callback_kw=read_callback_kw,
+                    get_metadata_kw=get_metadata_kw,
                 )
                 if can_compute_metadata_while_skipping_rows:
                     pd_df_metadata = pd_df_metadata_temp
diff --git a/modin/tests/pandas/test_io.py b/modin/tests/pandas/test_io.py
index 3088bc1ab65..bade22ef8ec 100644
--- a/modin/tests/pandas/test_io.py
+++ b/modin/tests/pandas/test_io.py
@@ -653,7 +653,7 @@ def test_read_csv_encoding_976(self, pathlike):
     # Quoting, Compression parameters tests
     @pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
     @pytest.mark.parametrize("encoding", [None, "latin8", "utf16"])
-    @pytest.mark.parametrize("engine", [None, "python", "c"])
+    @pytest.mark.parametrize("engine", [None, "python", "c", "pyarrow"])
     def test_read_csv_compression(self, make_csv_file, compression, encoding, engine):
         unique_filename = make_csv_file(encoding=encoding, compression=compression)
         expected_exception = None
From c647cf4e19ab0ca4b018ab1a091ae5c79174a498 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 19 Jun 2024 13:12:24 +0200
Subject: [PATCH 04/10] FIX-#7320: Bump the github-actions group with 3 updates (#7319)

Bumps the github-actions group with 3 updates: [actions/cache](https://github.com/actions/cache), [Slashgear/action-check-pr-title](https://github.com/slashgear/action-check-pr-title) and [github/codeql-action](https://github.com/github/codeql-action).

Updates `actions/cache` from 2 to 4
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v2...v4)

Updates `Slashgear/action-check-pr-title` from 3.0.0 to 4.3.0
- [Release notes](https://github.com/slashgear/action-check-pr-title/releases)
- [Commits](https://github.com/slashgear/action-check-pr-title/compare/v3.0.0...v4.3.0)

Updates `github/codeql-action` from 2 to 3
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/v2...v3)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
- dependency-name: Slashgear/action-check-pr-title
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
- dependency-name: github/codeql-action
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/ci-notebooks.yml | 2 +-
 .github/workflows/ci-required.yml  | 2 +-
 .github/workflows/codeql.yml       | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml
index 4fe193c5803..d890a086b9a 100644
--- a/.github/workflows/ci-notebooks.yml
+++ b/.github/workflows/ci-notebooks.yml
@@ -35,7 +35,7 @@ jobs:
         activate-environment: modin_on_unidist
       if: matrix.execution == 'pandas_on_unidist'
     - name: Cache datasets
-      uses: actions/cache@v2
+      uses: actions/cache@v4
       with:
         path: taxi.csv
         # update cache only if notebooks require it to be changed
diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml
index d7a8e7d07ee..d9a1229994c 100644
--- a/.github/workflows/ci-required.yml
+++ b/.github/workflows/ci-required.yml
@@ -13,7 +13,7 @@ jobs:
   check-pr-title:
     runs-on: ubuntu-latest
     steps:
-      - uses: Slashgear/action-check-pr-title@v3.0.0
+      - uses: Slashgear/action-check-pr-title@v4.3.0
         with:
           # NOTE: If you change the allowed prefixes here, update
           # the documentation about them in /docs/development/contributing.rst
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4ba68a246a6..93f99cd0056 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -32,16 +32,16 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
         with:
           languages: ${{ matrix.language }}
           queries: +security-and-quality
           config-file: ./.github/workflows/codeql/codeql-config.yml
 
       - name: Autobuild
-        uses: github/codeql-action/autobuild@v2
+        uses: github/codeql-action/autobuild@v3
 
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
         with:
           category: "/language:${{ matrix.language }}"

From e9ab99ac83c7843add5bfddb603efcf9ae8af6c1 Mon Sep 17 00:00:00 2001
From: Jayson729 <91502167+Jayson729@users.noreply.github.com>
Date: Wed, 19 Jun 2024 08:02:55 -0400
Subject: [PATCH 05/10] FEAT-#6574: UserWarning no longer displayed when Series/DataFrames are small (#7323)

Signed-off-by: Jayson Willey <91502167+Jayson729@users.noreply.github.com>
---
 modin/pandas/dataframe.py                     |  9 ++++--
 modin/pandas/series.py                        | 28 ++++++++--------
 modin/tests/pandas/dataframe/test_default.py | 34 ++++++++++++++++++++
 modin/tests/pandas/test_series.py             | 14 ++++----
 4 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index fe28d3680e0..8a80809dd3e 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -205,9 +205,6 @@ def __init__(
                 self._query_compiler = distributed_frame._query_compiler
                 return
 
-            warnings.warn(
-                "Distributing {} object. This may take some time.".format(type(data))
-            )
             if isinstance(data, pandas.Index):
                 pass
             elif (
@@ -253,6 +250,12 @@ def __init__(
             pandas_df = pandas.DataFrame(
                 data=data, index=index, columns=columns, dtype=dtype, copy=copy
             )
+            if pandas_df.size >= 1_000_000:
+                warnings.warn(
+                    "Distributing {} object. This may take some time.".format(
+                        type(data)
+                    )
+                )
             self._query_compiler = from_pandas(pandas_df)._query_compiler
         else:
             self._query_compiler = query_compiler
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 7818c52654d..76470ab243c 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -137,26 +137,28 @@ def __init__(
             query_compiler.columns = pandas.Index([MODIN_UNNAMED_SERIES_LABEL])
         if query_compiler is None:
             # Defaulting to pandas
-            warnings.warn(
-                "Distributing {} object. This may take some time.".format(type(data))
-            )
             if name is None:
                 name = MODIN_UNNAMED_SERIES_LABEL
                 if isinstance(data, pandas.Series) and data.name is not None:
                     name = data.name
-            query_compiler = from_pandas(
-                pandas.DataFrame(
-                    pandas.Series(
-                        data=data,
-                        index=index,
-                        dtype=dtype,
-                        name=name,
-                        copy=copy,
-                        fastpath=fastpath,
+            pandas_df = pandas.DataFrame(
+                pandas.Series(
+                    data=data,
+                    index=index,
+                    dtype=dtype,
+                    name=name,
+                    copy=copy,
+                    fastpath=fastpath,
+                )
+            )
+            if pandas_df.size >= 2_500_000:
+                warnings.warn(
+                    "Distributing {} object. This may take some time.".format(
+                        type(data)
                     )
                 )
-            )._query_compiler
+            query_compiler = from_pandas(pandas_df)._query_compiler
         self._query_compiler = query_compiler.columnarize()
         if name is not None:
             self.name = name
diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py
index bad7e54031b..697a0d7f120 100644
--- a/modin/tests/pandas/dataframe/test_default.py
+++ b/modin/tests/pandas/dataframe/test_default.py
@@ -1473,3 +1473,37 @@ def test_df_from_series_with_tuple_name():
     df_equals(pd.DataFrame(pandas.Series(name=("a", 1))), pandas_result)
     # 2. Creating a Modin DF from Modin Series
     df_equals(pd.DataFrame(pd.Series(name=("a", 1))), pandas_result)
+
+
+def test_large_df_warns_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with pytest.warns(UserWarning, match=regex):
+        pd.DataFrame(np.random.randint(1_000_000, size=(100_000, 10)))
+
+
+def test_large_series_warns_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with pytest.warns(UserWarning, match=regex):
+        pd.Series(np.random.randint(1_000_000, size=(2_500_000)))
+
+
+def test_df_does_not_warn_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", regex, UserWarning)
+        pd.DataFrame(np.random.randint(1_000_000, size=(100_000, 9)))
+
+
+def test_series_does_not_warn_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", regex, UserWarning)
+        pd.Series(np.random.randint(1_000_000, size=(2_400_000)))
diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index 434496baac5..9dd8b98aac3 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -18,6 +18,7 @@
 import json
 import sys
 import unittest.mock as mock
+import warnings
 
 import matplotlib
 import numpy as np
@@ -26,7 +27,7 @@ import pytest
 from numpy.testing import assert_array_equal
 from pandas.core.indexing import IndexingError
-from pandas.errors import SpecificationError
+from pandas.errors import PerformanceWarning, SpecificationError
 
 import modin.pandas as pd
 from modin.config import Engine, NPartitions, StorageFormat
@@ -3429,13 +3430,10 @@ def test_sub(data):
 
 def test_6782():
     datetime_scalar = datetime.datetime(1970, 1, 1, 0, 0)
-    with pytest.warns(UserWarning) as warns:
-        _ = pd.Series([datetime.datetime(2000, 1, 1)]) - datetime_scalar
-    for warn in warns.list:
-        assert (
-            "Adding/subtracting object-dtype array to DatetimeArray not vectorized"
-            not in str(warn)
-        )
+    match = "Adding/subtracting object-dtype array to DatetimeArray not vectorized"
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", match, PerformanceWarning)
+        pd.Series([datetime.datetime(2000, 1, 1)]) - datetime_scalar
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
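Reviewer note on [PATCH 05/10]: the warning is now gated on frame size, 1,000,000 elements for DataFrame and 2,500,000 for Series, instead of firing on every defaulting-to-pandas construction. A hypothetical session showing the new behavior with the same `catch_warnings` technique the added tests use; the array shapes are illustrative and a default execution engine is assumed to be available:

```python
import warnings

import numpy as np
import modin.pandas as pd

# Below the DataFrame threshold: construction should now be silent.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.DataFrame(np.zeros((1_000, 10)))  # 10_000 elements
assert not any("Distributing" in str(w.message) for w in caught)

# Above the threshold: the warning still fires.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.DataFrame(np.zeros((200_000, 10)))  # 2_000_000 elements
assert any("Distributing" in str(w.message) for w in caught)
```
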
pygments_style = "sphinx" From c8bbca8e4e00c681370e3736b2f73bb0352408c3 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 26 Jun 2024 16:28:19 +0200 Subject: [PATCH 07/10] Release version 0.31.0 (#7328) Signed-off-by: Anatoly Myachev From 2eff03cd7db298ed283c6ff124dd6fbc7515f66c Mon Sep 17 00:00:00 2001 From: Kirill Suvorov Date: Thu, 27 Jun 2024 14:52:56 +0200 Subject: [PATCH 08/10] REFACTOR-#0000: Update copyright date (#7333) Signed-off-by: Kirill Suvorov --- NOTICE | 2 +- docs/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NOTICE b/NOTICE index 47193941bf0..77a9b840267 100644 --- a/NOTICE +++ b/NOTICE @@ -1,3 +1,3 @@ Modin -Copyright (c) 2018-2023 Modin Developers. +Copyright (c) 2018-2024 Modin Developers. diff --git a/docs/conf.py b/docs/conf.py index cde50b8cadd..7022f054550 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ def noop_decorator(*args, **kwargs): export_config_help(configs_file_path) project = "Modin" -copyright = "2018-2023, Modin Developers." +copyright = "2018-2024, Modin Developers." author = "Modin contributors" # The short X.Y version From 759d548814a6ac224e83e7531cf98e20b13d85cb Mon Sep 17 00:00:00 2001 From: Iaroslav Igoshev Date: Fri, 28 Jun 2024 10:51:06 +0200 Subject: [PATCH 09/10] FIX-#7329: Do not sort columns on df.update (#7330) Signed-off-by: Igoshev, Iaroslav --- modin/core/dataframe/algebra/binary.py | 4 +++ .../dataframe/pandas/dataframe/dataframe.py | 21 ++++++++++----- .../storage_formats/pandas/query_compiler.py | 2 ++ modin/tests/pandas/dataframe/test_binary.py | 27 +++++++++++++++++++ .../pandas/dataframe/test_map_metadata.py | 1 + 5 files changed, 49 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b5e701d2d4b..b107089eda5 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -298,6 +298,7 @@ def register( cls, func: Callable[..., pandas.DataFrame], join_type: str = "outer", + sort: bool = None, labels: str = "replace", infer_dtypes: Optional[str] = None, ) -> Callable[..., PandasQueryCompiler]: @@ -310,6 +311,8 @@ def register( Binary function to execute. Have to be able to accept at least two arguments. join_type : {'left', 'right', 'outer', 'inner', None}, default: 'outer' Type of join that will be used if indices of operands are not aligned. + sort : bool, default: None + Whether to sort index and columns or not. labels : {"keep", "replace", "drop"}, default: "replace" Whether keep labels from left Modin DataFrame, replace them with labels from joined DataFrame or drop altogether to make them be computed lazily later. @@ -419,6 +422,7 @@ def caller( lambda x, y: func(x, y, *args, **kwargs), [other._modin_frame], join_type=join_type, + sort=sort, labels=labels, dtypes=dtypes, ), diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index b8467ebe084..5456f28f127 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3255,7 +3255,6 @@ def broadcast_apply( axis, other, join_type, - sort=not self.get_axis(axis).equals(other.get_axis(axis)), ) # unwrap list returned by `copartition`. 
right_parts = right_parts[0] @@ -3681,7 +3680,7 @@ def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> boo ) and self._get_axis_lengths(axis) == other._get_axis_lengths(axis) def _copartition( - self, axis, other, how, sort, force_repartition=False, fill_value=None + self, axis, other, how, sort=None, force_repartition=False, fill_value=None ): """ Copartition two Modin DataFrames. @@ -3696,8 +3695,9 @@ def _copartition( Other Modin DataFrame(s) to copartition against. how : str How to manage joining the index object ("left", "right", etc.). - sort : bool + sort : bool, default: None Whether sort the joined index or not. + If ``None``, sort is defined in depend on labels equality along the axis. force_repartition : bool, default: False Whether force the repartitioning or not. By default, this method will skip repartitioning if it is possible. This is because @@ -3730,6 +3730,9 @@ def _copartition( self._get_axis_lengths_cache(axis), ) + if sort is None: + sort = not all(self.get_axis(axis).equals(o.get_axis(axis)) for o in other) + self_index = self.get_axis(axis) others_index = [o.get_axis(axis) for o in other] joined_index, make_reindexer = self._join_index_objects( @@ -3823,6 +3826,7 @@ def n_ary_op( op, right_frames: list[PandasDataframe], join_type="outer", + sort=None, copartition_along_columns=True, labels="replace", dtypes: Optional[pandas.Series] = None, @@ -3838,6 +3842,8 @@ def n_ary_op( Modin DataFrames to join with. join_type : str, default: "outer" Type of join to apply. + sort : bool, default: None + Whether to sort index and columns or not. copartition_along_columns : bool, default: True Whether to perform copartitioning along columns or not. For some ops this isn't needed (e.g., `fillna`). @@ -3854,7 +3860,10 @@ def n_ary_op( New Modin DataFrame. 
""" left_parts, list_of_right_parts, joined_index, row_lengths = self._copartition( - 0, right_frames, join_type, sort=True + 0, + right_frames, + join_type, + sort=sort, ) if copartition_along_columns: new_left_frame = self.__constructor__( @@ -3886,7 +3895,7 @@ def n_ary_op( 1, new_right_frames, join_type, - sort=True, + sort=sort, ) else: joined_columns = self.copy_columns_cache(copy_lengths=True) @@ -3978,7 +3987,7 @@ def _compute_new_widths(): joined_index, partition_sizes_along_axis, ) = self._copartition( - axis.value ^ 1, others, how, sort, force_repartition=False + axis.value ^ 1, others, how, sort=sort, force_repartition=False ) if axis == Axis.COL_WISE: new_lengths = partition_sizes_along_axis diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7c4f7e79f55..9d4467c2085 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -460,6 +460,7 @@ def to_numpy(self, **kwargs): df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", + sort=False, ) series_update = Binary.register( copy_df_for_func( @@ -467,6 +468,7 @@ def to_numpy(self, **kwargs): display_name="update", ), join_type="left", + sort=False, ) # Needed for numpy API diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index e153f9f892f..108e2620aac 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -527,3 +527,30 @@ def test_arithmetic_with_tricky_dtypes(val1, val2, op, request): lambda dfs: getattr(dfs[0], op)(dfs[1]), expected_exception=expected_exception, ) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("match_index", [True, False]) +def test_bin_op_mismatched_columns(data, other_data, axis, match_index): + modin_df, pandas_df = create_test_dfs(data) + other_modin_df, other_pandas_df = create_test_dfs(other_data) + if axis == 0: + if not match_index: + modin_df.index = pandas_df.index = ["1", "2", "3"] + other_modin_df.index = other_pandas_df.index = ["2", "1", "3"] + eval_general( + modin_df, + pandas_df, + lambda df: ( + df.add(other_modin_df, axis=axis) + if isinstance(df, pd.DataFrame) + else df.add(other_pandas_df, axis=axis) + ), + ) diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index 4b19d5fbd9d..d6980cd6761 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1592,6 +1592,7 @@ def test_transpose(data): "data, other_data", [ ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), ( {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, {"B": ["d", "e", "f", "g", "h", "i"]}, From 4e7afa7ea59c7a160ed504f39652ff23b4d49be3 Mon Sep 17 00:00:00 2001 From: Chi-Sheng Liu Date: Wed, 3 Jul 2024 22:22:28 +0800 Subject: [PATCH 10/10] DOCS-#7335: Fix borken links in Modin Usage Examples page (#7336) Signed-off-by: Chi-Sheng Liu --- docs/usage_guide/examples/index.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
From 4e7afa7ea59c7a160ed504f39652ff23b4d49be3 Mon Sep 17 00:00:00 2001
From: Chi-Sheng Liu
Date: Wed, 3 Jul 2024 22:22:28 +0800
Subject: [PATCH 10/10] DOCS-#7335: Fix broken links in Modin Usage Examples page (#7336)

Signed-off-by: Chi-Sheng Liu
---
 docs/usage_guide/examples/index.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/usage_guide/examples/index.rst b/docs/usage_guide/examples/index.rst
index 0b23b2c838c..2e8cc3fa9b5 100644
--- a/docs/usage_guide/examples/index.rst
+++ b/docs/usage_guide/examples/index.rst
@@ -16,8 +16,7 @@ The following tutorials cover the basic usage of Modin. `Here `__,
   `Source PandasOnDask `__]
-- Exercise 5: Setting up Modin in a Cluster Environment [`Source PandasOnRay `__]
-- Exercise 6: Running Modin in a Cluster Environment [`Source PandasOnRay `__]
+- Exercise 5: Setting up Modin in a Cluster Environment [`Source PandasOnRay `__]
 
 How to get required dependencies for the tutorial notebooks and to run them please
 refer to the respective `README.md `__ file.
@@ -25,7 +24,7 @@ How to get required dependencies for the tutorial notebooks and to run them plea
 Data Science Benchmarks
 '''''''''''''''''''''''
 
-- Using Modin with the NYC Taxi Dataset [`Source `__]
+- Using Modin with the NYC Taxi Dataset [`Source `__]
 - Using Modin with the Census Dataset (coming soon...)
 - Using Modin with the Plasticc Dataset (coming soon...)