modin-project
diff --git a/.github/actions/run-core-tests/group_2/action.yml b/.github/actions/run-core-tests/group_2/action.yml
Lines changed: 2 additions & 0 deletions
diff --git a/.github/actions/run-core-tests/group_3/action.yml b/.github/actions/run-core-tests/group_3/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml
Lines changed: 15 additions & 9 deletions
diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml
Lines changed: 12 additions & 3 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 42 additions & 61 deletions
@@ -20,3 +20,5 @@ runs:
                                                       modin/pandas/test/dataframe/test_pickle.py
           echo "::endgroup::"
         shell: bash -l {0}
+      - run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
+        shell: bash -l {0}
@@ -19,6 +19,6 @@ runs:
         shell: bash -l {0}
       - run: |
           echo "::group::Running experimental groupby tests (group 3)..."
-          MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
+          MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
           echo "::endgroup::"
         shell: bash -l {0}
@@ -8,6 +8,7 @@ on:
       - setup.cfg
       - setup.py
       - requirements/env_hdk.yml
+      - requirements/env_unidist_linux.yml
 concurrency:
   # Cancel other jobs in the same branch. We don't care whether CI passes
   # on old commits.
@@ -28,12 +29,17 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/python-only
-        if: matrix.execution != 'hdk_on_native'
+        if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
       - uses: ./.github/actions/mamba-env
         with:
           environment-file: requirements/env_hdk.yml
           activate-environment: modin_on_hdk
         if: matrix.execution == 'hdk_on_native'
+      - uses: ./.github/actions/mamba-env
+        with:
+          environment-file: requirements/env_unidist_linux.yml
+          activate-environment: modin_on_unidist
+        if: matrix.execution == 'pandas_on_unidist'
       - name: Cache datasets
         uses: actions/cache@v2
         with:
@@ -43,29 +49,29 @@ jobs:
       # replace modin with . in the tutorial requirements file for `pandas_on_ray` and
       # `pandas_on_dask` since we need Modin built from sources
       - run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
-        if: matrix.execution != 'hdk_on_native'
+        if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
       # install dependencies required for notebooks execution for `pandas_on_ray` and `pandas_on_dask`
       # Override modin-spreadsheet install for now
       - run: |
           pip install -r examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
           pip install git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
-        if: matrix.execution != 'hdk_on_native'
-      # Build Modin from sources for `hdk_on_native`
+        if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
+      # Build Modin from sources for `hdk_on_native` and `pandas_on_unidist`
       - run: pip install -e .
-        if: matrix.execution == 'hdk_on_native'
+        if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
       # install test dependencies
       # NOTE: If you are changing the set of packages installed here, make sure that
       # the dev requirements match them.
       - run: pip install pytest pytest-cov black flake8 flake8-print flake8-no-implicit-concat
-        if: matrix.execution != 'hdk_on_native'
+        if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
       - run: pip install flake8-print jupyter nbformat nbconvert
-        if: matrix.execution == 'hdk_on_native'
+        if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
       - run: pip list
-        if: matrix.execution != 'hdk_on_native'
+        if: matrix.execution != 'hdk_on_native' && matrix.execution != 'pandas_on_unidist'
       - run: |
           conda info
           conda list
-        if: matrix.execution == 'hdk_on_native'
+        if: matrix.execution == 'hdk_on_native' || matrix.execution == 'pandas_on_unidist'
       # setup kernel configuration for `pandas_on_unidist` execution with mpi backend
       - run: python examples/tutorial/jupyter/execution/${{ matrix.execution }}/setup_kernel.py
         if: matrix.execution == 'pandas_on_unidist'
 
@@ -66,8 +66,6 @@ jobs:
             asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \
             asv_bench/benchmarks/scalability/__init__.py \
             modin/core/io \
-            modin/experimental/core/execution/ray/implementations/pandas_on_ray \
-            modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \
             modin/pandas/series.py \
             modin/core/execution/python \
             modin/pandas/dataframe.py \
@@ -91,7 +89,6 @@ jobs:
           python scripts/doc_checker.py modin/experimental/pandas/io.py \
             modin/experimental/pandas/__init__.py
       - run: python scripts/doc_checker.py modin/core/storage_formats/base
-      - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow
       - run: python scripts/doc_checker.py modin/core/storage_formats/pandas
       - run: |
           python scripts/doc_checker.py \
@@ -108,3 +105,15 @@ jobs:
       - run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol
       - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
       - run: python scripts/doc_checker.py modin/logging
+
+  lint-black-isort:
+    name: lint (black and isort)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./.github/actions/python-only
+      - run: pip install "black>=24.1.0" "isort>=5.12"  # quoted: a bare `>` is a shell output redirect, which drops the version floor
+      # NOTE: keep the black command here in sync with the pre-commit hook in
+      # /contributing/pre-commit
+      - run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py
+      - run: isort . --check-only
@@ -26,17 +26,6 @@ env:
   MODIN_GITHUB_CI: true
 
 jobs:
-  lint-black:
-    name: lint (black)
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: ./.github/actions/python-only
-      - run: pip install black
-      # NOTE: keep the black command here in sync with the pre-commit hook in
-      # /contributing/pre-commit
-      - run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py
-
   lint-mypy:
     name: lint (mypy)
     runs-on: ubuntu-latest
@@ -77,7 +66,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-clean-install:
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     strategy:
       matrix:
         os:
@@ -92,14 +81,20 @@ jobs:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/python-only
       - run: python -m pip install -e ".[all]"
-      - name: Ensure all engines start up
+      - name: Ensure Ray and Dask engines start up
         run: |
           MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
           MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+      - name: Ensure MPI engine start up
+        # Install a working MPI implementation beforehand so mpi4py can link to it; refresh the apt index and pass -y so the install never prompts
+        run: |
+          sudo apt update && sudo apt install -y libmpich-dev
+          python -m pip install -e ".[mpi]"
           MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+        if: matrix.os == 'ubuntu'
 
   test-internals:
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -124,7 +119,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-defaults:
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -155,7 +150,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-hdk:
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -193,6 +188,7 @@ jobs:
       - run: python -m pytest modin/pandas/test/dataframe/test_binary.py
       - run: python -m pytest modin/pandas/test/dataframe/test_reduce.py
       - run: python -m pytest modin/pandas/test/dataframe/test_join_sort.py
+      - run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
       - run: python -m pytest modin/pandas/test/test_general.py
       - run: python -m pytest modin/pandas/test/dataframe/test_indexing.py
       - run: python -m pytest modin/pandas/test/test_series.py
@@ -212,7 +208,7 @@ jobs:
 
   test-asv-benchmarks:
     if: github.event_name == 'pull_request'
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -256,11 +252,6 @@ jobs:
               MODIN_ASV_USE_IMPL=pandas asv run --quick --strict --show-stderr --launch-method=spawn \
                 -b ^benchmarks -b ^io | tee benchmarks.log
 
-              # HDK: ERR_OUT_OF_CPU_MEM: Not enough host memory to execute the query (MODIN#4270)
-              # just disable test for testing - it works well in a machine with more memory
-              sed -i 's/def time_groupby_agg_nunique(self, \*args, \*\*kwargs):/# def time_groupby_agg_nunique(self, *args, **kwargs):/g' benchmarks/hdk/benchmarks.py
-              sed -i 's/execute(self.df.groupby(by=self.groupby_columns).agg("nunique"))/# execute(self.df.groupby(by=self.groupby_columns).agg("nunique"))/g' benchmarks/hdk/benchmarks.py
-
               # Otherwise, ASV considers that the environment has already been created, although ASV command is run for another config,
               # which requires the creation of a completely new environment. This step will be required after removing the manual environment setup step.
               rm -f -R .asv/env/
@@ -322,7 +313,7 @@ jobs:
               "${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT
 
   test-all-unidist:
-    needs: [lint-flake8, lint-black, execution-filter]
+    needs: [lint-flake8, execution-filter]
     if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true'
     runs-on: ubuntu-latest
     defaults:
@@ -353,7 +344,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/mamba-env
         with:
-          environment-file: requirements/env_unidist.yml
+          environment-file: requirements/env_unidist_linux.yml
           activate-environment: modin_on_unidist
           python-version: ${{matrix.python-version}}
       - name: Install HDF5
@@ -376,8 +367,18 @@ jobs:
       - run: ./.github/workflows/sql_server/set_up_sql_server.sh
       # need an extra argument "genv" to set environment variables for mpiexec. We need
       # these variables to test writing to the mock s3 filesystem.
-      - run: mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
-      - run: mpiexec -n 1 python -m pytest modin/experimental/pandas/test/test_io_exp.py
+      - uses: nick-fields/retry@v2
+        # to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
+        # for details see: https://github.com/modin-project/modin/pull/6776
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          command: |
+            conda run --no-capture-output -n modin_on_unidist mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key \
+              -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
+      - run: |
+          mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret \
+            python -m pytest modin/experimental/pandas/test/test_io_exp.py
       - run: mpiexec -n 1 python -m pytest modin/experimental/sql/test/test_sql.py
       - run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/test_general.py
       - run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
@@ -387,7 +388,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-all:
-    needs: [lint-flake8, lint-black, execution-filter]
+    needs: [lint-flake8, execution-filter]
     strategy:
       matrix:
         os:
@@ -521,7 +522,7 @@ jobs:
         if: matrix.os == 'windows'
 
   test-sanity:
-    needs: [lint-flake8, lint-black, execution-filter]
+    needs: [lint-flake8, execution-filter]
     if: github.event_name == 'pull_request'
     strategy:
       matrix:
@@ -560,7 +561,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/mamba-env
         with:
-          environment-file: ${{ matrix.execution.name == 'unidist' && 'requirements/env_unidist.yml' || 'environment-dev.yml' }}
+          environment-file: ${{ matrix.os == 'ubuntu' && matrix.execution.name == 'unidist' && 'requirements/env_unidist_linux.yml' || matrix.os == 'windows' && matrix.execution.name == 'unidist' && 'requirements/env_unidist_win.yml' || 'environment-dev.yml' }}
           activate-environment: ${{ matrix.execution.name == 'unidist' && 'modin_on_unidist' || 'modin' }}
           python-version: ${{matrix.python-version}}
       - name: Install HDF5
@@ -584,6 +585,7 @@ jobs:
       - run: MODIN_BENCHMARK_MODE=True ${{ matrix.execution.shell-ex }} modin/pandas/test/internals/test_benchmark_mode.py
       - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/pandas/test/internals/test_repartition.py
       - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/test_partition_api.py
+      - run: ${{ matrix.execution.shell-ex }} modin/pandas/api/extensions/test
       - name: xgboost tests
         run: |
           # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
@@ -630,6 +632,15 @@ jobs:
         if: matrix.os != 'windows'
       - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/numpy/test
       - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
+        if: matrix.execution.name != 'unidist'
+      - uses: nick-fields/retry@v2
+        # to avoid issues with non-stable `to_csv` tests for unidist on MPI backend.
+        # for details see: https://github.com/modin-project/modin/pull/6776
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          command: conda run --no-capture-output -n modin_on_unidist ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
+        if: matrix.execution.name == 'unidist'
       - run: ${{ matrix.execution.shell-ex }} modin/experimental/pandas/test/test_io_exp.py
       - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/test_general.py
       - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
@@ -644,7 +655,7 @@ jobs:
       - uses: ./.github/actions/upload-coverage
 
   test-experimental:
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -672,38 +683,8 @@ jobs:
       - run: python -m pytest modin/pandas/test/test_io.py --verbose
       - uses: ./.github/actions/upload-coverage
 
-  test-pyarrow:
-    needs: [lint-flake8, lint-black]
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: bash -l {0}
-    strategy:
-      matrix:
-        python-version: ["3.9"]
-    env:
-      MODIN_STORAGE_FORMAT: pyarrow
-      MODIN_EXPERIMENTAL: "True"
-    name: test (pyarrow, python ${{matrix.python-version}})
-    services:
-      moto:
-        image: motoserver/moto
-        ports:
-          - 5000:5000
-        env:
-          AWS_ACCESS_KEY_ID: foobar_key
-          AWS_SECRET_ACCESS_KEY: foobar_secret
-    steps:
-      - uses: actions/checkout@v3
-      - uses: ./.github/actions/mamba-env
-        with:
-          environment-file: environment-dev.yml
-          python-version: ${{matrix.python-version}}
-      - run: sudo apt update && sudo apt install -y libhdf5-dev
-      - run: python -m pytest modin/pandas/test/test_io.py::TestCsv --verbose
-
   test-spreadsheet:
-    needs: [lint-flake8, lint-black]
+    needs: [lint-flake8]
     runs-on: ubuntu-latest
     defaults:
       run: