[KED-1116] Add support for Python 3.8 (#464)
Lorena Bălan authored Mar 17, 2020
1 parent 975f7fa commit fe48953
Showing 14 changed files with 121 additions and 37 deletions.
22 changes: 22 additions & 0 deletions .circleci/config.yml
@@ -10,6 +10,10 @@ executors:
  py37:
    docker:
      - image: quantumblacklabs/kedro_builder:3.7
+  py38:
+    docker:
+      - image: quantumblacklabs/kedro_builder:3.8


commands:
  setup_conda:
@@ -150,6 +154,18 @@ jobs:
    executor: py37
    steps: [build_docs]

+  unit_tests_38:
+    executor: py38
+    steps: [unit_tests]
+
+  linters_38:
+    executor: py38
+    steps: [lint]
+
+  e2e_tests_38:
+    executor: py38
+    steps: [e2e_tests]
+
  pip_compile:
    executor: py37
    steps: [pip_compile]
@@ -178,6 +194,9 @@ workflows:
      - linters_37
      - e2e_tests_37
      - docs_37
+      - unit_tests_38
+      - linters_38
+      - e2e_tests_38
      - pip_compile
      - all_circleci_checks_succeeded:
          requires:
@@ -192,4 +211,7 @@ workflows:
            - linters_37
            - e2e_tests_37
            - docs_37
+            - unit_tests_38
+            - linters_38
+            - e2e_tests_38
            - pip_compile
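Taken together, the CI change simply mirrors the existing 3.7 executor and jobs onto a 3.8 Docker image. As a rough local stand-in for the new `unit_tests_38` job — assuming a `python3.8` interpreter on PATH, which is not something this commit provides — one might run:

```python
# Hypothetical local equivalent of the unit_tests_38 CI job (illustration only;
# the real job runs inside the quantumblacklabs/kedro_builder:3.8 Docker image).
import subprocess
import sys


def run_unit_tests(python: str = "python3.8") -> int:
    """Run the test suite under a specific interpreter, mirroring the CI matrix."""
    result = subprocess.run([python, "-m", "pytest", "tests/"], check=False)
    return result.returncode


if __name__ == "__main__":
    sys.exit(run_unit_tests())
```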
2 changes: 2 additions & 0 deletions .coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = tests/extras/datasets/spark/*
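The new `.coveragerc` keeps the Spark dataset tests out of coverage measurement, since on 3.8 they are skipped wholesale (see the `skip_if_py38` marker below). A rough programmatic equivalent, assuming the `coverage` package and shown only for illustration:

```python
# Rough programmatic equivalent of the new .coveragerc (assumes coverage.py is
# installed; in practice pytest-cov reads the INI file instead).
import coverage

cov = coverage.Coverage(omit=["tests/extras/datasets/spark/*"])
cov.start()
# ... import code and run tests here ...
cov.stop()
cov.report()  # prints a coverage summary, minus the omitted Spark test files
```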
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@
| Theme | Status |
|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Latest Release | [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/) |
-| Python Version | [![Python Version](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg)](https://pypi.org/project/kedro/) |
+| Python Version | [![Python Version](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7%20%7C%203.8-blue.svg)](https://pypi.org/project/kedro/) |
| `master` Branch Build | [![CircleCI](https://circleci.com/gh/quantumblacklabs/kedro/tree/master.svg?style=shield)](https://circleci.com/gh/quantumblacklabs/kedro/tree/master) |
| `develop` Branch Build | [![CircleCI](https://circleci.com/gh/quantumblacklabs/kedro/tree/develop.svg?style=shield)](https://circleci.com/gh/quantumblacklabs/kedro/tree/develop) |
| Documentation Build | [![Documentation](https://readthedocs.org/projects/kedro/badge/?version=latest)](https://kedro.readthedocs.io/) |
1 change: 1 addition & 0 deletions RELEASE.md
@@ -3,6 +3,7 @@
## Major features and improvements
* Added new CLI command `kedro catalog list`.
* Added a `GeoJSONDataSet` dataset in `kedro.extras.datasets.geopandas` for working with geospatial data that uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to communicate with the underlying filesystem.
+* Enabled Python 3.8 compatibility. _Please note that a Spark workflow may be unreliable for this Python version as `pyspark` is not fully compatible with 3.8 yet._

## Bug fixes and other changes
* Fixed a bug where a new version created mid-run by an external system caused inconsistencies in the load versions used in the current run.
2 changes: 1 addition & 1 deletion docs/source/06_resources/01_faq.md
@@ -137,7 +137,7 @@ You can find a list of Kedro projects in the [`kedro-examples`](https://github.c

## What version of Python does Kedro use?

-Kedro is built for Python 3.5, 3.6 and 3.7.
+Kedro is built for Python 3.5, 3.6, 3.7 and 3.8.

## How do I upgrade Kedro?

16 changes: 4 additions & 12 deletions kedro/template/{{ cookiecutter.repo_name }}/kedro_cli.py
@@ -211,9 +211,7 @@ def cli():
    "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP
)
@click.option("--parallel", "-p", is_flag=True, multiple=False, help=PARALLEL_ARG_HELP)
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP)
@click.option(
    "--load-version",
@@ -473,9 +471,7 @@ def jupyter():
    "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP
)
@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP)
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
def jupyter_notebook(ip, all_kernels, env, idle_timeout, args):
    """Open Jupyter Notebook with project specific variables loaded."""
    if "-h" not in args and "--help" not in args:
@@ -495,9 +491,7 @@ def jupyter_notebook(ip, all_kernels, env, idle_timeout, args):
    "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP
)
@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP)
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
def jupyter_lab(ip, all_kernels, env, idle_timeout, args):
    """Open Jupyter Lab with project specific variables loaded."""
    if "-h" not in args and "--help" not in args:
@@ -608,9 +602,7 @@ def catalog():


@catalog.command("list")
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
@click.option(
    "--pipeline", type=str, default="", help=PIPELINE_ARG_HELP, callback=_split_string
)
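The `kedro_cli.py` hunks are purely cosmetic: dropping the trailing comma lets each three-line `--env` option collapse to a single line. A minimal runnable sketch of the collapsed style, using a hypothetical `demo` command rather than anything from the project template:

```python
# Hypothetical command showing the single-line option style used above.
import click

ENV_ARG_HELP = "Kedro configuration environment name."


@click.command()
@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
def demo(env):
    """Echo the chosen configuration environment."""
    click.echo(f"env={env}")


if __name__ == "__main__":
    demo()  # click parses sys.argv itself
```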
3 changes: 2 additions & 1 deletion setup.py
@@ -122,7 +122,7 @@
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/quantumblacklabs/kedro",
-    python_requires=">=3.5, <3.8",
+    python_requires=">=3.5, <3.9",
    packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]),
    include_package_data=True,
    tests_require=test_requires,
@@ -137,6 +137,7 @@
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
    ],
    extras_require=extras_require,
)
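The packaging change is twofold: `python_requires` now admits 3.8 while still excluding 3.9, and a new Trove classifier advertises the support on PyPI. A trimmed, illustrative sketch of the relevant `setup()` fields — the real file carries many more arguments:

```python
# Trimmed, illustrative view of the packaging change (not the full setup.py).
from setuptools import find_packages, setup

setup(
    name="kedro",
    python_requires=">=3.5, <3.9",  # pip refuses installation on other interpreters
    packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]),
    classifiers=[
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",  # new: advertised on PyPI
    ],
)
```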
4 changes: 2 additions & 2 deletions test_requirements.txt
@@ -11,7 +11,7 @@ gcsfs>=0.3.0, <1.0
geopandas>=0.6.0, <1.0
hdfs>=2.5.8, <3.0
import-linter==1.0; python_version >= '3.6'
-joblib==0.12.3
+joblib>=0.14
jupyter_client>=5.1.0, <6.0
matplotlib>=3.0.3, <4.0
memory_profiler>=0.50.0, <1.0
@@ -26,7 +26,7 @@ pre-commit>=1.17.0, <2.0.0
psutil==5.6.6
pyarrow>=0.12.0, <1.0.0
pylint>=2.4.4, <3.0
-pyspark>=2.2.0, <3.0
+pyspark>=2.2.0, <3.0; python_version < '3.8'
pytest-cov>=2.5, <3.0
pytest-mock>=1.7.1,<2.0
pytest>=3.9, <4.0
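The `pyspark` requirement gains a PEP 508 environment marker, so on Python 3.8 pip skips it entirely rather than installing a package that cannot run there. A minimal sketch of how such a marker evaluates, assuming the third-party `packaging` library (not a dependency added by this commit):

```python
# Evaluate the environment marker added to the pyspark requirement
# (requires the `packaging` library; illustration only).
from packaging.markers import Marker

marker = Marker("python_version < '3.8'")
# True on 3.5-3.7 (pyspark gets installed), False on 3.8 (requirement skipped).
print(marker.evaluate())
```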
15 changes: 13 additions & 2 deletions tests/conftest.py
@@ -37,9 +37,15 @@
import sys
from subprocess import Popen

+import mock
import pytest
-from pyspark import SparkContext
-from pyspark.sql import SparkSession

+if sys.version_info < (3, 8):
+    from pyspark import SparkContext
+    from pyspark.sql import SparkSession
+else:
+    SparkContext = mock.Mock()
+    SparkSession = mock.Mock()

the_real_getOrCreate = None

@@ -93,3 +99,8 @@ def preserve_system_context():

    if os.getcwd() != old_cwd:
        os.chdir(old_cwd)
+
+
+skip_if_py38 = pytest.mark.skipif(
+    sys.version_info >= (3, 8), reason="Dependency not compatible with Python 3.8 yet"
+)
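`conftest.py` now guards the `pyspark` imports behind a version check, substituting `mock.Mock()` placeholders so the module still imports cleanly on 3.8, and exposes a shared `skip_if_py38` marker. A minimal usage sketch — in the real suite the marker is imported from `tests.conftest` rather than redefined:

```python
# Hypothetical test module showing how the shared marker is applied.
import sys

import pytest

skip_if_py38 = pytest.mark.skipif(
    sys.version_info >= (3, 8), reason="Dependency not compatible with Python 3.8 yet"
)


@skip_if_py38
def test_spark_only_behaviour():
    # Only runs on Python < 3.8, where pyspark is importable.
    assert True
```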
4 changes: 2 additions & 2 deletions tests/context/test_context.py
@@ -376,8 +376,8 @@ def test_pipelines(self, dummy_context):
        assert len(dummy_context.pipelines["__default__"].nodes) == 4


-@pytest.mark.usefixtures("config_dir")  # pylint: disable=too-many-public-methods
-class TestKedroContextRun:
+@pytest.mark.usefixtures("config_dir")
+class TestKedroContextRun:  # pylint: disable=too-many-public-methods
    def test_run_output(self, dummy_context, dummy_dataframe):
        dummy_context.catalog.save("cars", dummy_dataframe)
        outputs = dummy_context.run()
20 changes: 17 additions & 3 deletions tests/extras/datasets/spark/test_memory_dataset.py
@@ -27,12 +27,22 @@
# limitations under the License.

# pylint: disable=no-name-in-module
+import sys
+
+import mock
import pytest
-from pyspark.sql import DataFrame as SparkDataFrame
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, when

from kedro.io import MemoryDataSet
+from tests.conftest import skip_if_py38
+
+if sys.version_info < (3, 8):
+    from pyspark.sql import SparkSession
+    from pyspark.sql import DataFrame as SparkDataFrame
+    from pyspark.sql.functions import col, when
+else:
+    SparkSession = mock.ANY
+    col = when = mock.ANY
+    SparkDataFrame = mock.ANY


def _update_spark_df(data, idx, jdx, value):
@@ -64,13 +74,15 @@ def memory_data_set(spark_data_frame):
    return MemoryDataSet(data=spark_data_frame)


+@skip_if_py38
def test_load_modify_original_data(memory_data_set, spark_data_frame):
    """Check that the data set object is not updated when the original
    SparkDataFrame is changed."""
    spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, -5)
    assert not _check_equals(memory_data_set.load(), spark_data_frame)


+@skip_if_py38
def test_save_modify_original_data(spark_data_frame):
    """Check that the data set object is not updated when the original
    SparkDataFrame is changed."""
@@ -81,6 +93,7 @@ def test_save_modify_original_data(spark_data_frame):
    assert not _check_equals(memory_data_set.load(), spark_data_frame)


+@skip_if_py38
def test_load_returns_same_spark_object(memory_data_set, spark_data_frame):
    """Test that consecutive loads point to the same object in case of
    a SparkDataFrame"""
@@ -91,6 +104,7 @@ def test_load_returns_same_spark_object(memory_data_set, spark_data_frame):
    assert loaded_data is reloaded_data


+@skip_if_py38
def test_str_representation(memory_data_set):
    """Test string representation of the data set"""
    assert "MemoryDataSet(data=<DataFrame>)" in str(memory_data_set)
28 changes: 22 additions & 6 deletions tests/extras/datasets/spark/test_spark_dataset.py
@@ -25,24 +25,35 @@
#
# See the License for the specific language governing permissions and
# limitations under the License.
-
+import sys
import tempfile
from pathlib import Path

+import mock
import pandas as pd
import pytest
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col  # pylint: disable=no-name-in-module
-from pyspark.sql.types import IntegerType, StringType, StructField, StructType
-from pyspark.sql.utils import AnalysisException

from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet
from kedro.extras.datasets.pickle import PickleDataSet
-from kedro.extras.datasets.spark import SparkDataSet
from kedro.io import DataCatalog, DataSetError, Version
from kedro.io.core import generate_timestamp
from kedro.pipeline import Pipeline, node
from kedro.runner import ParallelRunner
+from tests.conftest import skip_if_py38
+
+if sys.version_info < (3, 8):
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import col  # pylint: disable=no-name-in-module
+    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+    from pyspark.sql.utils import AnalysisException
+    from kedro.extras.datasets.spark import SparkDataSet
+else:
+    SparkSession = mock.ANY
+    SparkDataSet = mock.ANY
+    col = mock.ANY
+    IntegerType = StringType = StructField = StructType = mock.ANY
+    AnalysisException = mock.ANY


FOLDER_NAME = "fake_folder"
FILENAME = "test.parquet"
@@ -143,6 +154,7 @@ def spark_out(tmp_path):
    return SparkDataSet(filepath=str(tmp_path / "output"))


+@skip_if_py38
class TestSparkDataSet:
    def test_load_parquet(self, tmp_path, sample_pandas_df):
        temp_path = str(tmp_path / "data")
@@ -334,6 +346,7 @@ def test_parallel_runner_with_memory_dataset(
        runner.run(pipeline, catalog)


+@skip_if_py38
class TestSparkDataSetVersionedLocal:
    def test_no_version(self, versioned_dataset_local):
        pattern = r"Did not find any versions for SparkDataSet\(.+\)"
@@ -399,6 +412,7 @@ def test_prevent_overwrite(self, tmp_path, version, sample_spark_df):
        versioned_local.save(sample_spark_df)


+@skip_if_py38
class TestSparkDataSetVersionedDBFS:
    def test_load_latest(  # pylint: disable=too-many-arguments
        self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df
@@ -457,6 +471,7 @@ def test_exists(  # pylint: disable=too-many-arguments
        assert mocked_glob.call_args_list == expected_calls


+@skip_if_py38
class TestSparkDataSetVersionedS3:
    def test_no_version(self, versioned_dataset_s3):
        pattern = r"Did not find any versions for SparkDataSet\(.+\)"
@@ -569,6 +584,7 @@ def test_repr(self, versioned_dataset_s3, version):
        assert "version=" not in str(dataset_s3)


+@skip_if_py38
class TestSparkDataSetVersionedHdfs:
    def test_no_version(self, mocker, version):
        hdfs_walk = mocker.patch(
21 changes: 16 additions & 5 deletions tests/extras/datasets/spark/test_spark_hive_dataset.py
@@ -26,18 +26,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
+import sys
from pathlib import Path
from tempfile import TemporaryDirectory

+import mock
import pytest
from psutil import Popen
-from pyspark import SparkContext
-from pyspark.sql import SparkSession
-from pyspark.sql.types import IntegerType, StringType, StructField, StructType

-from kedro.extras.datasets.spark import SparkHiveDataSet
from kedro.io import DataSetError
-from tests.conftest import UseTheSparkSessionFixtureOrMock
+from tests.conftest import UseTheSparkSessionFixtureOrMock, skip_if_py38
+
+if sys.version_info < (3, 8):
+    from pyspark import SparkContext
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+    from kedro.extras.datasets.spark import SparkHiveDataSet
+else:
+    SparkContext = SparkSession = mock.ANY
+    IntegerType = StringType = StructField = StructType = mock.ANY
+    AnalysisException = mock.ANY
+    SparkHiveDataSet = mock.ANY


TESTSPARKDIR = "test_spark_dir"

@@ -156,6 +166,7 @@ def _generate_spark_df_upsert_expected():
    return SparkSession.builder.getOrCreate().createDataFrame(data, schema).coalesce(1)


+@skip_if_py38
class TestSparkHiveDataSet:
    def test_cant_pickle(self):
        import pickle  # pylint: disable=import-outside-toplevel