From d54f92790a882c24ab2041e3089284738641a3c6 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Mon, 17 Jun 2024 13:12:58 +0200
Subject: [PATCH 01/10] TEST-#7316: Run a subset of CI tests with python 3.10 and 3.11 on a scheduled basis (#7289)

Signed-off-by: Anatoly Myachev
---
 .github/actions/python-only/action.yml |  2 +-
 .github/workflows/ci-required.yml      |  2 +-
 .github/workflows/ci.yml               | 63 ++++++++++++++++++++------
 modin/tests/pandas/test_series.py      |  7 ++-
 4 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/.github/actions/python-only/action.yml b/.github/actions/python-only/action.yml
index 128519ba32b..3664b4fb751 100644
--- a/.github/actions/python-only/action.yml
+++ b/.github/actions/python-only/action.yml
@@ -3,7 +3,7 @@ description: "Prepare the environment to run simple tasks"
 inputs:
   python-version:
     description: "Python version to install"
-    default: "3.9.x"
+    default: "3.9"
 
 runs:
   using: "composite"
diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml
index f575945cd85..d7a8e7d07ee 100644
--- a/.github/workflows/ci-required.yml
+++ b/.github/workflows/ci-required.yml
@@ -28,7 +28,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v5
         with:
-          python-version: "3.9.x"
+          python-version: "3.9"
           architecture: "x64"
           cache: "pip"
           cache-dependency-path: '**/requirements-doc.txt'
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 68ea8eaac3e..9ec1ca9d22d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,6 +17,9 @@ on:
       - setup.py
       - versioneer.py
   push:
+  schedule:
+    - cron: "30 2 * * WED"
+    - cron: "30 2 * * THU"
 concurrency:
   # Cancel other jobs in the same branch. We don't care whether CI passes
   # on old commits.
@@ -26,21 +29,44 @@ env:
   MODIN_GITHUB_CI: true
 
 jobs:
+  python-filter:
+    runs-on: ubuntu-latest
+    outputs:
+      python-version: ${{ steps.choose.outputs.python-version }}
+    steps:
+      - id: choose
+        run: |
+          if [[ "${{ github.event.schedule }}" = "30 2 * * WED" ]]
+          then
+            echo "python-version=3.10" >> "$GITHUB_OUTPUT"
+          elif [[ "${{ github.event.schedule }}" = "30 2 * * THU" ]]
+          then
+            echo "python-version=3.11" >> "$GITHUB_OUTPUT"
+          else
+            echo "python-version=3.9" >> "$GITHUB_OUTPUT"
+          fi
+
   lint-mypy:
+    needs: [python-filter]
     name: lint (mypy)
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - run: pip install -r requirements-dev.txt
       - run: mypy --config-file mypy.ini
 
   lint-flake8:
+    needs: [python-filter]
     name: lint (flake8)
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       # NOTE: If you are changing the set of packages installed here, make sure that
       # the dev requirements match them.
       - run: pip install flake8 flake8-print flake8-no-implicit-concat
@@ -49,6 +75,7 @@ jobs:
       - run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py
 
   test-api-and-no-engine:
+    needs: [python-filter]
     name: Test API, headers and no-engine mode
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -59,6 +86,7 @@ jobs:
       - uses: ./.github/actions/mamba-env
        with:
          environment-file: requirements/requirements-no-engine.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
      - run: python -m pytest modin/tests/pandas/test_api.py
      - run: python -m pytest modin/tests/test_executions_api.py
      - run: python -m pytest modin/tests/test_headers.py
@@ -66,7 +94,7 @@ jobs:
      - uses: ./.github/actions/upload-coverage
 
   test-clean-install:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     strategy:
       matrix:
         os:
           - ubuntu
           - windows
     runs-on: ${{ matrix.os }}-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -80,6 +108,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - run: python -m pip install -e ".[all]"
       - name: Ensure Ray and Dask engines start up
         run: |
@@ -94,7 +124,7 @@ jobs:
         if: matrix.os == 'ubuntu'
 
   test-internals:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -105,6 +135,7 @@ jobs:
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: environment-dev.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Internals tests
         run: python -m pytest modin/tests/core/test_dispatcher.py
       - run: python -m pytest modin/tests/config
@@ -120,7 +151,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-defaults:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       matrix:
        execution: [BaseOnPython]
    env:
      MODIN_TEST_DATASET_SIZE: "small"
-    name: Test ${{ matrix.execution }} execution, Python 3.9
+    name: Test ${{ matrix.execution }} execution, Python ${{ needs.python-filter.outputs.python-version }}
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: environment-dev.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Install HDF5
         run: sudo apt update && sudo apt install -y libhdf5-dev
       - name: xgboost tests
@@ -240,7 +272,7 @@ jobs:
           "${{ steps.filter.outputs.ray }}"
           "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT
 
   test-all-unidist:
-    needs: [lint-flake8, execution-filter]
+    needs: [lint-flake8, execution-filter, python-filter]
     if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true'
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         unidist-backend: ["mpi"]
     env:
       MODIN_ENGINE: "Unidist"
@@ -314,13 +346,13 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-all:
-    needs: [lint-flake8, execution-filter]
+    needs: [lint-flake8, execution-filter, python-filter]
     strategy:
       matrix:
         os:
           - ubuntu
           - windows
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         engine: ${{ fromJSON( github.event_name == 'push' && '["python", "ray", "dask"]' || needs.execution-filter.outputs.engines ) }}
         test_task:
           - group_1
@@ -446,14 +478,14 @@ jobs:
         if: matrix.os == 'windows'
 
   test-sanity:
-    needs: [lint-flake8, execution-filter]
+    needs: [lint-flake8, execution-filter, python-filter]
     if: github.event_name == 'pull_request'
     strategy:
       matrix:
         os:
           - ubuntu
           - windows
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         execution:
           - name: ray
             shell-ex: "python -m pytest"
@@ -579,7 +611,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-experimental:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -601,6 +633,7 @@ jobs:
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: environment-dev.yml
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Install HDF5
         run: sudo apt update && sudo apt install -y libhdf5-dev
       - run: python -m pytest -n 2 modin/tests/pandas/dataframe/test_map_metadata.py
@@ -610,14 +643,14 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-spreadsheet:
-    needs: [lint-flake8]
+    needs: [lint-flake8, python-filter]
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.9"]
+        python-version: [ "${{ needs.python-filter.outputs.python-version }}" ]
         engine: ["ray", "dask"]
     env:
       MODIN_EXPERIMENTAL: "True"
@@ -647,7 +680,7 @@ jobs:
           delete-merged: true
 
   upload-coverage:
-    needs: [merge-coverage-artifacts]
+    needs: [merge-coverage-artifacts, python-filter]
     if: always()  # we need to run it regardless of some job being skipped, like in PR
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -l {0}
@@ -656,6 +689,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: ./.github/actions/python-only
+        with:
+          python-version: ${{ needs.python-filter.outputs.python-version }}
       - name: Download coverage data
         uses: actions/download-artifact@v4
         with:
diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index 8e125abf64d..434496baac5 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -16,6 +16,7 @@
 import datetime
 import itertools
 import json
+import sys
 import unittest.mock as mock
 
 import matplotlib
@@ -4879,11 +4880,15 @@ def set_categories(ser):
 
     # pandas 2.0.0: Removed setting Categorical.categories directly (GH47834)
     # Just check the exception
+    expected_exception = AttributeError("can't set attribute")
+    if sys.version_info >= (3, 10):
+        # The exception message varies across different versions of Python
+        expected_exception = False
     eval_general(
         modin_series,
         pandas_series,
         set_categories,
-        expected_exception=AttributeError("can't set attribute"),
+        expected_exception=expected_exception,
     )
From 35298c057fd6ad9822d309441ae56684696ffad1 Mon Sep 17 00:00:00 2001
From: Kurt McKee
Date: Mon, 17 Jun 2024 18:31:06 -0500
Subject: [PATCH 02/10] Add a Dependabot config to auto-update GitHub action versions (#7318)

Signed-off-by: Kurt McKee
---
 .github/dependabot.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 .github/dependabot.yaml

diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml
new file mode 100644
index 00000000000..2390d8c809e
--- /dev/null
+++ b/.github/dependabot.yaml
@@ -0,0 +1,10 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    groups:
+      github-actions:
+        patterns:
+          - "*"

From 08c1b115d0d9a3630c937237be1152d554c4e984 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Tue, 18 Jun 2024 16:18:15 +0200
Subject: [PATCH 03/10] FIX-#7321: Using 'C' engine instead of 'pyarrow' for getting metadata in 'read_csv' (#7322)

Signed-off-by: Anatoly Myachev
---
 modin/core/io/text/text_file_dispatcher.py | 34 ++++++++++++----------
 modin/tests/pandas/test_io.py              |  2 +-
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py
index 6f7e68393b9..130fd484ca1 100644
--- a/modin/core/io/text/text_file_dispatcher.py
+++ b/modin/core/io/text/text_file_dispatcher.py
@@ -216,7 +216,7 @@ def partitioned_file(
         newline: bytes = None,
         header_size: int = 0,
         pre_reading: int = 0,
-        read_callback_kw: dict = None,
+        get_metadata_kw: dict = None,
     ):
         """
         Compute chunk sizes in bytes for every partition.
@@ -244,7 +244,7 @@ def partitioned_file(
             Number of rows, that occupied by header.
         pre_reading : int, default: 0
             Number of rows between header and skipped rows, that should be read.
-        read_callback_kw : dict, optional
+        get_metadata_kw : dict, optional
             Keyword arguments for `cls.read_callback` to compute metadata if needed.
             This option is not compatible with `pre_reading!=0`.
 
@@ -255,11 +255,11 @@ def partitioned_file(
             int : partition start read byte
             int : partition end read byte
         pandas.DataFrame or None
-            Dataframe from which metadata can be retrieved. Can be None if `read_callback_kw=None`.
+            Dataframe from which metadata can be retrieved. Can be None if `get_metadata_kw=None`.
         """
-        if read_callback_kw is not None and pre_reading != 0:
+        if get_metadata_kw is not None and pre_reading != 0:
             raise ValueError(
-                f"Incompatible combination of parameters: {read_callback_kw=}, {pre_reading=}"
+                f"Incompatible combination of parameters: {get_metadata_kw=}, {pre_reading=}"
             )
         read_rows_counter = 0
         outside_quotes = True
@@ -297,11 +297,11 @@ def partitioned_file(
                 rows_skipper(skiprows)
         else:
             rows_skipper(skiprows)
-            if read_callback_kw:
+            if get_metadata_kw:
                 start = f.tell()
                 # For correct behavior, if we want to avoid double skipping rows,
                 # we need to get metadata after skipping.
-                pd_df_metadata = cls.read_callback(f, **read_callback_kw)
+                pd_df_metadata = cls.read_callback(f, **get_metadata_kw)
                 f.seek(start)
             rows_skipper(header_size)
 
@@ -1063,28 +1063,32 @@ def _read(cls, filepath_or_buffer, **kwargs):
             and (usecols is None or skiprows is None)
             and pre_reading == 0
         )
-        read_callback_kw = dict(kwargs, nrows=1, skipfooter=0, index_col=index_col)
+        get_metadata_kw = dict(kwargs, nrows=1, skipfooter=0, index_col=index_col)
+        if get_metadata_kw.get("engine", None) == "pyarrow":
+            # pyarrow engine doesn't support `nrows` option;
+            # https://github.com/pandas-dev/pandas/issues/38872 can be used to track pyarrow engine features
+            get_metadata_kw["engine"] = "c"
         if not can_compute_metadata_while_skipping_rows:
             pd_df_metadata = cls.read_callback(
                 filepath_or_buffer_md,
-                **read_callback_kw,
+                **get_metadata_kw,
             )
             column_names = pd_df_metadata.columns
             column_widths, num_splits = cls._define_metadata(
                 pd_df_metadata, column_names
             )
-            read_callback_kw = None
+            get_metadata_kw = None
         else:
-            read_callback_kw = dict(read_callback_kw, skiprows=None)
+            get_metadata_kw = dict(get_metadata_kw, skiprows=None)
             # `memory_map` doesn't work with file-like object so we can't use it here.
             # We can definitely skip it without violating the reading logic
             # since this parameter is intended to optimize reading.
             # For reading a couple of lines, this is not essential.
-            read_callback_kw.pop("memory_map", None)
+            get_metadata_kw.pop("memory_map", None)
             # These parameters are already used when opening file `f`,
             # they do not need to be used again.
-            read_callback_kw.pop("storage_options", None)
-            read_callback_kw.pop("compression", None)
+            get_metadata_kw.pop("storage_options", None)
+            get_metadata_kw.pop("compression", None)
 
         with OpenFile(
             filepath_or_buffer_md,
@@ -1110,7 +1114,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
                     newline=newline,
                     header_size=header_size,
                     pre_reading=pre_reading,
-                    read_callback_kw=read_callback_kw,
+                    get_metadata_kw=get_metadata_kw,
                 )
                 if can_compute_metadata_while_skipping_rows:
                     pd_df_metadata = pd_df_metadata_temp
diff --git a/modin/tests/pandas/test_io.py b/modin/tests/pandas/test_io.py
index 3088bc1ab65..bade22ef8ec 100644
--- a/modin/tests/pandas/test_io.py
+++ b/modin/tests/pandas/test_io.py
@@ -653,7 +653,7 @@ def test_read_csv_encoding_976(self, pathlike):
     # Quoting, Compression parameters tests
     @pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
     @pytest.mark.parametrize("encoding", [None, "latin8", "utf16"])
-    @pytest.mark.parametrize("engine", [None, "python", "c"])
+    @pytest.mark.parametrize("engine", [None, "python", "c", "pyarrow"])
     def test_read_csv_compression(self, make_csv_file, compression, encoding, engine):
         unique_filename = make_csv_file(encoding=encoding, compression=compression)
         expected_exception = None
From c647cf4e19ab0ca4b018ab1a091ae5c79174a498 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 19 Jun 2024 13:12:24 +0200
Subject: [PATCH 04/10] FIX-#7320: Bump the github-actions group with 3 updates (#7319)

Bumps the github-actions group with 3 updates: [actions/cache](https://github.com/actions/cache), [Slashgear/action-check-pr-title](https://github.com/slashgear/action-check-pr-title) and [github/codeql-action](https://github.com/github/codeql-action).

Updates `actions/cache` from 2 to 4
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v2...v4)

Updates `Slashgear/action-check-pr-title` from 3.0.0 to 4.3.0
- [Release notes](https://github.com/slashgear/action-check-pr-title/releases)
- [Commits](https://github.com/slashgear/action-check-pr-title/compare/v3.0.0...v4.3.0)

Updates `github/codeql-action` from 2 to 3
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/v2...v3)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
- dependency-name: Slashgear/action-check-pr-title
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
- dependency-name: github/codeql-action
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/ci-notebooks.yml | 2 +-
 .github/workflows/ci-required.yml  | 2 +-
 .github/workflows/codeql.yml       | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml
index 4fe193c5803..d890a086b9a 100644
--- a/.github/workflows/ci-notebooks.yml
+++ b/.github/workflows/ci-notebooks.yml
@@ -35,7 +35,7 @@ jobs:
         activate-environment: modin_on_unidist
       if: matrix.execution == 'pandas_on_unidist'
     - name: Cache datasets
-      uses: actions/cache@v2
+      uses: actions/cache@v4
       with:
         path: taxi.csv
         # update cache only if notebooks require it to be changed
diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml
index d7a8e7d07ee..d9a1229994c 100644
--- a/.github/workflows/ci-required.yml
+++ b/.github/workflows/ci-required.yml
@@ -13,7 +13,7 @@ jobs:
   check-pr-title:
     runs-on: ubuntu-latest
     steps:
-      - uses: Slashgear/action-check-pr-title@v3.0.0
+      - uses: Slashgear/action-check-pr-title@v4.3.0
         with:
           # NOTE: If you change the allowed prefixes here, update
           # the documentation about them in /docs/development/contributing.rst
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4ba68a246a6..93f99cd0056 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -32,16 +32,16 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
         with:
           languages: ${{ matrix.language }}
           queries: +security-and-quality
           config-file: ./.github/workflows/codeql/codeql-config.yml
 
       - name: Autobuild
-        uses: github/codeql-action/autobuild@v2
+        uses: github/codeql-action/autobuild@v3
 
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
         with:
           category: "/language:${{ matrix.language }}"

From e9ab99ac83c7843add5bfddb603efcf9ae8af6c1 Mon Sep 17 00:00:00 2001
From: Jayson729 <91502167+Jayson729@users.noreply.github.com>
Date: Wed, 19 Jun 2024 08:02:55 -0400
Subject: [PATCH 05/10] FEAT-#6574: UserWarning no longer displayed when Series/DataFrames are small (#7323)

Signed-off-by: Jayson Willey <91502167+Jayson729@users.noreply.github.com>
---
 modin/pandas/dataframe.py                     |  9 ++++--
 modin/pandas/series.py                        | 28 ++++++++--------
 modin/tests/pandas/dataframe/test_default.py | 34 ++++++++++++++++++++
 modin/tests/pandas/test_series.py             | 14 ++++----
 4 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index fe28d3680e0..8a80809dd3e 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -205,9 +205,6 @@ def __init__(
                 self._query_compiler = distributed_frame._query_compiler
                 return
 
-            warnings.warn(
-                "Distributing {} object. This may take some time.".format(type(data))
-            )
             if isinstance(data, pandas.Index):
                 pass
             elif (
@@ -253,6 +250,12 @@ def __init__(
             pandas_df = pandas.DataFrame(
                 data=data, index=index, columns=columns, dtype=dtype, copy=copy
             )
+            if pandas_df.size >= 1_000_000:
+                warnings.warn(
+                    "Distributing {} object. This may take some time.".format(
+                        type(data)
+                    )
+                )
             self._query_compiler = from_pandas(pandas_df)._query_compiler
         else:
             self._query_compiler = query_compiler
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 7818c52654d..76470ab243c 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -137,26 +137,28 @@ def __init__(
             query_compiler.columns = pandas.Index([MODIN_UNNAMED_SERIES_LABEL])
         if query_compiler is None:
             # Defaulting to pandas
-            warnings.warn(
-                "Distributing {} object. This may take some time.".format(type(data))
-            )
             if name is None:
                 name = MODIN_UNNAMED_SERIES_LABEL
                 if isinstance(data, pandas.Series) and data.name is not None:
                     name = data.name
-            query_compiler = from_pandas(
-                pandas.DataFrame(
-                    pandas.Series(
-                        data=data,
-                        index=index,
-                        dtype=dtype,
-                        name=name,
-                        copy=copy,
-                        fastpath=fastpath,
+            pandas_df = pandas.DataFrame(
+                pandas.Series(
+                    data=data,
+                    index=index,
+                    dtype=dtype,
+                    name=name,
+                    copy=copy,
+                    fastpath=fastpath,
+                )
+            )
+            if pandas_df.size >= 2_500_000:
+                warnings.warn(
+                    "Distributing {} object. This may take some time.".format(
+                        type(data)
                     )
                 )
-            )._query_compiler
+            query_compiler = from_pandas(pandas_df)._query_compiler
         self._query_compiler = query_compiler.columnarize()
         if name is not None:
             self.name = name
diff --git a/modin/tests/pandas/dataframe/test_default.py b/modin/tests/pandas/dataframe/test_default.py
index bad7e54031b..697a0d7f120 100644
--- a/modin/tests/pandas/dataframe/test_default.py
+++ b/modin/tests/pandas/dataframe/test_default.py
@@ -1473,3 +1473,37 @@ def test_df_from_series_with_tuple_name():
     df_equals(pd.DataFrame(pandas.Series(name=("a", 1))), pandas_result)
     # 2. Creating a Modin DF from Modin Series
     df_equals(pd.DataFrame(pd.Series(name=("a", 1))), pandas_result)
+
+
+def test_large_df_warns_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with pytest.warns(UserWarning, match=regex):
+        pd.DataFrame(np.random.randint(1_000_000, size=(100_000, 10)))
+
+
+def test_large_series_warns_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with pytest.warns(UserWarning, match=regex):
+        pd.Series(np.random.randint(1_000_000, size=(2_500_000)))
+
+
+def test_df_does_not_warn_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", regex, UserWarning)
+        pd.DataFrame(np.random.randint(1_000_000, size=(100_000, 9)))
+
+
+def test_series_does_not_warn_distributing_takes_time():
+    # https://github.com/modin-project/modin/issues/6574
+
+    regex = r"Distributing (.*) object\. This may take some time\."
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", regex, UserWarning)
+        pd.Series(np.random.randint(1_000_000, size=(2_400_000)))
diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py
index 434496baac5..9dd8b98aac3 100644
--- a/modin/tests/pandas/test_series.py
+++ b/modin/tests/pandas/test_series.py
@@ -18,6 +18,7 @@
 import json
 import sys
 import unittest.mock as mock
+import warnings
 
 import matplotlib
 import numpy as np
@@ -26,7 +27,7 @@ import pytest
 from numpy.testing import assert_array_equal
 from pandas.core.indexing import IndexingError
-from pandas.errors import SpecificationError
+from pandas.errors import PerformanceWarning, SpecificationError
 
 import modin.pandas as pd
 from modin.config import Engine, NPartitions, StorageFormat
@@ -3429,13 +3430,10 @@ def test_sub(data):
 
 def test_6782():
     datetime_scalar = datetime.datetime(1970, 1, 1, 0, 0)
-    with pytest.warns(UserWarning) as warns:
-        _ = pd.Series([datetime.datetime(2000, 1, 1)]) - datetime_scalar
-    for warn in warns.list:
-        assert (
-            "Adding/subtracting object-dtype array to DatetimeArray not vectorized"
-            not in str(warn)
-        )
+    match = "Adding/subtracting object-dtype array to DatetimeArray not vectorized"
+    with warnings.catch_warnings():
+        warnings.filterwarnings("error", match, PerformanceWarning)
+        pd.Series([datetime.datetime(2000, 1, 1)]) - datetime_scalar
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
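Reviewer note on [PATCH 05/10]: the warning is now gated on frame size, 1,000,000 elements for DataFrame and 2,500,000 for Series, instead of firing on every defaulting-to-pandas construction. A hypothetical session showing the new behavior with the same `catch_warnings` technique the added tests use; the array shapes are illustrative and a default execution engine is assumed to be available:

```python
import warnings

import numpy as np
import modin.pandas as pd

# Below the DataFrame threshold: construction should now be silent.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.DataFrame(np.zeros((1_000, 10)))  # 10_000 elements
assert not any("Distributing" in str(w.message) for w in caught)

# Above the threshold: the warning still fires.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.DataFrame(np.zeros((200_000, 10)))  # 2_000_000 elements
assert any("Distributing" in str(w.message) for w in caught)
```
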
pygments_style = "sphinx" From c8bbca8e4e00c681370e3736b2f73bb0352408c3 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 26 Jun 2024 16:28:19 +0200 Subject: [PATCH 07/10] Release version 0.31.0 (#7328) Signed-off-by: Anatoly Myachev From 2eff03cd7db298ed283c6ff124dd6fbc7515f66c Mon Sep 17 00:00:00 2001 From: Kirill Suvorov Date: Thu, 27 Jun 2024 14:52:56 +0200 Subject: [PATCH 08/10] REFACTOR-#0000: Update copyright date (#7333) Signed-off-by: Kirill Suvorov --- NOTICE | 2 +- docs/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NOTICE b/NOTICE index 47193941bf0..77a9b840267 100644 --- a/NOTICE +++ b/NOTICE @@ -1,3 +1,3 @@ Modin -Copyright (c) 2018-2023 Modin Developers. +Copyright (c) 2018-2024 Modin Developers. diff --git a/docs/conf.py b/docs/conf.py index cde50b8cadd..7022f054550 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ def noop_decorator(*args, **kwargs): export_config_help(configs_file_path) project = "Modin" -copyright = "2018-2023, Modin Developers." +copyright = "2018-2024, Modin Developers." author = "Modin contributors" # The short X.Y version From 759d548814a6ac224e83e7531cf98e20b13d85cb Mon Sep 17 00:00:00 2001 From: Iaroslav Igoshev Date: Fri, 28 Jun 2024 10:51:06 +0200 Subject: [PATCH 09/10] FIX-#7329: Do not sort columns on df.update (#7330) Signed-off-by: Igoshev, Iaroslav --- modin/core/dataframe/algebra/binary.py | 4 +++ .../dataframe/pandas/dataframe/dataframe.py | 21 ++++++++++----- .../storage_formats/pandas/query_compiler.py | 2 ++ modin/tests/pandas/dataframe/test_binary.py | 27 +++++++++++++++++++ .../pandas/dataframe/test_map_metadata.py | 1 + 5 files changed, 49 insertions(+), 6 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b5e701d2d4b..b107089eda5 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -298,6 +298,7 @@ def register( cls, func: Callable[..., pandas.DataFrame], join_type: str = "outer", + sort: bool = None, labels: str = "replace", infer_dtypes: Optional[str] = None, ) -> Callable[..., PandasQueryCompiler]: @@ -310,6 +311,8 @@ def register( Binary function to execute. Have to be able to accept at least two arguments. join_type : {'left', 'right', 'outer', 'inner', None}, default: 'outer' Type of join that will be used if indices of operands are not aligned. + sort : bool, default: None + Whether to sort index and columns or not. labels : {"keep", "replace", "drop"}, default: "replace" Whether keep labels from left Modin DataFrame, replace them with labels from joined DataFrame or drop altogether to make them be computed lazily later. @@ -419,6 +422,7 @@ def caller( lambda x, y: func(x, y, *args, **kwargs), [other._modin_frame], join_type=join_type, + sort=sort, labels=labels, dtypes=dtypes, ), diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index b8467ebe084..5456f28f127 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -3255,7 +3255,6 @@ def broadcast_apply( axis, other, join_type, - sort=not self.get_axis(axis).equals(other.get_axis(axis)), ) # unwrap list returned by `copartition`. 
right_parts = right_parts[0] @@ -3681,7 +3680,7 @@ def _check_if_axes_identical(self, other: PandasDataframe, axis: int = 0) -> boo ) and self._get_axis_lengths(axis) == other._get_axis_lengths(axis) def _copartition( - self, axis, other, how, sort, force_repartition=False, fill_value=None + self, axis, other, how, sort=None, force_repartition=False, fill_value=None ): """ Copartition two Modin DataFrames. @@ -3696,8 +3695,9 @@ def _copartition( Other Modin DataFrame(s) to copartition against. how : str How to manage joining the index object ("left", "right", etc.). - sort : bool + sort : bool, default: None Whether sort the joined index or not. + If ``None``, sort is defined in depend on labels equality along the axis. force_repartition : bool, default: False Whether force the repartitioning or not. By default, this method will skip repartitioning if it is possible. This is because @@ -3730,6 +3730,9 @@ def _copartition( self._get_axis_lengths_cache(axis), ) + if sort is None: + sort = not all(self.get_axis(axis).equals(o.get_axis(axis)) for o in other) + self_index = self.get_axis(axis) others_index = [o.get_axis(axis) for o in other] joined_index, make_reindexer = self._join_index_objects( @@ -3823,6 +3826,7 @@ def n_ary_op( op, right_frames: list[PandasDataframe], join_type="outer", + sort=None, copartition_along_columns=True, labels="replace", dtypes: Optional[pandas.Series] = None, @@ -3838,6 +3842,8 @@ def n_ary_op( Modin DataFrames to join with. join_type : str, default: "outer" Type of join to apply. + sort : bool, default: None + Whether to sort index and columns or not. copartition_along_columns : bool, default: True Whether to perform copartitioning along columns or not. For some ops this isn't needed (e.g., `fillna`). @@ -3854,7 +3860,10 @@ def n_ary_op( New Modin DataFrame. 
""" left_parts, list_of_right_parts, joined_index, row_lengths = self._copartition( - 0, right_frames, join_type, sort=True + 0, + right_frames, + join_type, + sort=sort, ) if copartition_along_columns: new_left_frame = self.__constructor__( @@ -3886,7 +3895,7 @@ def n_ary_op( 1, new_right_frames, join_type, - sort=True, + sort=sort, ) else: joined_columns = self.copy_columns_cache(copy_lengths=True) @@ -3978,7 +3987,7 @@ def _compute_new_widths(): joined_index, partition_sizes_along_axis, ) = self._copartition( - axis.value ^ 1, others, how, sort, force_repartition=False + axis.value ^ 1, others, how, sort=sort, force_repartition=False ) if axis == Axis.COL_WISE: new_lengths = partition_sizes_along_axis diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 7c4f7e79f55..9d4467c2085 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -460,6 +460,7 @@ def to_numpy(self, **kwargs): df_update = Binary.register( copy_df_for_func(pandas.DataFrame.update, display_name="update"), join_type="left", + sort=False, ) series_update = Binary.register( copy_df_for_func( @@ -467,6 +468,7 @@ def to_numpy(self, **kwargs): display_name="update", ), join_type="left", + sort=False, ) # Needed for numpy API diff --git a/modin/tests/pandas/dataframe/test_binary.py b/modin/tests/pandas/dataframe/test_binary.py index e153f9f892f..108e2620aac 100644 --- a/modin/tests/pandas/dataframe/test_binary.py +++ b/modin/tests/pandas/dataframe/test_binary.py @@ -527,3 +527,30 @@ def test_arithmetic_with_tricky_dtypes(val1, val2, op, request): lambda dfs: getattr(dfs[0], op)(dfs[1]), expected_exception=expected_exception, ) + + +@pytest.mark.parametrize( + "data, other_data", + [ + ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("match_index", [True, False]) +def test_bin_op_mismatched_columns(data, other_data, axis, match_index): + modin_df, pandas_df = create_test_dfs(data) + other_modin_df, other_pandas_df = create_test_dfs(other_data) + if axis == 0: + if not match_index: + modin_df.index = pandas_df.index = ["1", "2", "3"] + other_modin_df.index = other_pandas_df.index = ["2", "1", "3"] + eval_general( + modin_df, + pandas_df, + lambda df: ( + df.add(other_modin_df, axis=axis) + if isinstance(df, pd.DataFrame) + else df.add(other_pandas_df, axis=axis) + ), + ) diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index 4b19d5fbd9d..d6980cd6761 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -1592,6 +1592,7 @@ def test_transpose(data): "data, other_data", [ ({"A": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "C": [7, 8, 9]}), + ({"C": [1, 2, 3], "B": [400, 500, 600]}, {"B": [4, 5, 6], "A": [7, 8, 9]}), ( {"A": ["a", "b", "c"], "B": ["x", "y", "z"]}, {"B": ["d", "e", "f", "g", "h", "i"]}, From 4e7afa7ea59c7a160ed504f39652ff23b4d49be3 Mon Sep 17 00:00:00 2001 From: Chi-Sheng Liu Date: Wed, 3 Jul 2024 22:22:28 +0800 Subject: [PATCH 10/10] DOCS-#7335: Fix borken links in Modin Usage Examples page (#7336) Signed-off-by: Chi-Sheng Liu --- docs/usage_guide/examples/index.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
From 4e7afa7ea59c7a160ed504f39652ff23b4d49be3 Mon Sep 17 00:00:00 2001
From: Chi-Sheng Liu
Date: Wed, 3 Jul 2024 22:22:28 +0800
Subject: [PATCH 10/10] DOCS-#7335: Fix broken links in Modin Usage Examples page (#7336)

Signed-off-by: Chi-Sheng Liu
---
 docs/usage_guide/examples/index.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/usage_guide/examples/index.rst b/docs/usage_guide/examples/index.rst
index 0b23b2c838c..2e8cc3fa9b5 100644
--- a/docs/usage_guide/examples/index.rst
+++ b/docs/usage_guide/examples/index.rst
@@ -16,8 +16,7 @@ The following tutorials cover the basic usage of Modin. `Here `__,
   `Source PandasOnDask `__]
-- Exercise 5: Setting up Modin in a Cluster Environment [`Source PandasOnRay `__]
-- Exercise 6: Running Modin in a Cluster Environment [`Source PandasOnRay `__]
+- Exercise 5: Setting up Modin in a Cluster Environment [`Source PandasOnRay `__]
 
 How to get required dependencies for the tutorial notebooks and to run them please
 refer to the respective `README.md `__ file.
@@ -25,7 +24,7 @@ How to get required dependencies for the tutorial notebooks and to run them plea
 Data Science Benchmarks
 '''''''''''''''''''''''
 
-- Using Modin with the NYC Taxi Dataset [`Source `__]
+- Using Modin with the NYC Taxi Dataset [`Source `__]
 - Using Modin with the Census Dataset (coming soon...)
 - Using Modin with the Plasticc Dataset (coming soon...)