From 9b7f7dcb59611afd60c376d7a5b0de16f55f901d Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 26 Aug 2022 16:56:19 +0900 Subject: [PATCH 1/5] [SPARK-40229][PS][TEST] Re-enable excel I/O test for pandas API on Spark. --- .github/workflows/build_and_test.yml | 2 +- python/pyspark/pandas/tests/test_dataframe_conversion.py | 1 - python/pyspark/pandas/tests/test_dataframe_spark_io.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3a9157010e9b0..deb1ae8fc8581 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -242,7 +242,7 @@ jobs: - name: Install Python packages (Python 3.8) if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) run: | - python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting + python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting openpyxl python3.8 -m pip list # Run the tests. 
- name: Run tests diff --git a/python/pyspark/pandas/tests/test_dataframe_conversion.py b/python/pyspark/pandas/tests/test_dataframe_conversion.py index 0582f5db3721c..4e4c9ac2e7d9b 100644 --- a/python/pyspark/pandas/tests/test_dataframe_conversion.py +++ b/python/pyspark/pandas/tests/test_dataframe_conversion.py @@ -90,7 +90,6 @@ def get_excel_dfs(pandas_on_spark_location, pandas_location): "expected": pd.read_excel(pandas_location, index_col=0), } - @unittest.skip("openpyxl") def test_to_excel(self): with self.temp_dir() as dirpath: pandas_location = dirpath + "/" + "output1.xlsx" diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index d63ee659d3dec..d534e4c6684da 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -247,7 +247,6 @@ def test_spark_io(self): expected_idx.sort_values(by="f").to_spark().toPandas(), ) - @unittest.skip("openpyxl") def test_read_excel(self): with self.temp_dir() as tmp: From d149f5ddc5e1840ea4f8b13856c4af6b3c63daec Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 26 Aug 2022 17:20:32 +0900 Subject: [PATCH 2/5] Install package for Python 3.9 --- .github/workflows/build_and_test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index deb1ae8fc8581..ff61dfcdd15ee 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -383,6 +383,10 @@ jobs: uses: actions/setup-java@v1 with: java-version: ${{ matrix.java }} + - name: Install Python packages (Python 3.9, PyPy3) + run: | + # To test excel I/O for pandas API on Spark. 
+ python3.9 -m pip install openpyxl - name: List Python packages (Python 3.9, PyPy3) run: | python3.9 -m pip list From 83d9f512a5c0e24e692a2d947ad88202a647d7db Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 31 Aug 2022 17:21:21 +0900 Subject: [PATCH 3/5] Disable the read_excel test --- .github/workflows/build_and_test.yml | 2 +- python/pyspark/pandas/tests/test_dataframe_spark_io.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ff61dfcdd15ee..80fad27b97582 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -242,7 +242,7 @@ jobs: - name: Install Python packages (Python 3.8) if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) run: | - python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting openpyxl + python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting python3.8 -m pip list # Run the tests. 
- name: Run tests diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index d534e4c6684da..d63ee659d3dec 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -247,6 +247,7 @@ def test_spark_io(self): expected_idx.sort_values(by="f").to_spark().toPandas(), ) + @unittest.skip("openpyxl") def test_read_excel(self): with self.temp_dir() as tmp: From af72f08f05a980088b7aa8d0954c11aa25de61b7 Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 6 Sep 2022 08:18:46 +0900 Subject: [PATCH 4/5] Added TODO --- python/pyspark/pandas/tests/test_dataframe_spark_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/pandas/tests/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/test_dataframe_spark_io.py index d63ee659d3dec..6c9cc13e8250f 100644 --- a/python/pyspark/pandas/tests/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/test_dataframe_spark_io.py @@ -247,6 +247,7 @@ def test_spark_io(self): expected_idx.sort_values(by="f").to_spark().toPandas(), ) + # TODO(SPARK-40353): Re-enable `test_read_excel`. @unittest.skip("openpyxl") def test_read_excel(self): with self.temp_dir() as tmp: From cb2b95500fa6ed39163a6e889b1eb9021f708287 Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 6 Sep 2022 10:39:31 +0900 Subject: [PATCH 5/5] Move install to Dockerfile --- .github/workflows/build_and_test.yml | 4 ---- dev/infra/Dockerfile | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 80fad27b97582..3a9157010e9b0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -383,10 +383,6 @@ jobs: uses: actions/setup-java@v1 with: java-version: ${{ matrix.java }} - - name: Install Python packages (Python 3.9, PyPy3) - run: | - # To test excel I/O for pandas API on Spark. 
- python3.9 -m pip install openpyxl - name: List Python packages (Python 3.9, PyPy3) run: | python3.9 -m pip list diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 1a7e6451985e8..14e5d5d1c9fad 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -32,7 +32,7 @@ RUN $APT_INSTALL software-properties-common git libxml2-dev pkg-config curl wget RUN update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 -RUN python3.9 -m pip install numpy pyarrow 'pandas<=1.4.3' scipy unittest-xml-reporting plotly>=4.8 sklearn 'mlflow>=1.0' coverage matplotlib +RUN python3.9 -m pip install numpy pyarrow 'pandas<=1.4.3' scipy unittest-xml-reporting plotly>=4.8 sklearn 'mlflow>=1.0' coverage matplotlib openpyxl RUN add-apt-repository ppa:pypy/ppa RUN apt update