[KED-1116] Add support for Python 3.8 (#464)
Lorena Bălan authored Mar 17, 2020
1 parent 975f7fa commit fe48953
Showing 14 changed files with 121 additions and 37 deletions.
22 changes: 22 additions & 0 deletions .circleci/config.yml
@@ -10,6 +10,10 @@ executors:
  py37:
    docker:
      - image: quantumblacklabs/kedro_builder:3.7
+  py38:
+    docker:
+      - image: quantumblacklabs/kedro_builder:3.8


commands:
  setup_conda:
@@ -150,6 +154,18 @@ jobs:
    executor: py37
    steps: [build_docs]

+  unit_tests_38:
+    executor: py38
+    steps: [unit_tests]
+
+  linters_38:
+    executor: py38
+    steps: [lint]
+
+  e2e_tests_38:
+    executor: py38
+    steps: [e2e_tests]
+
  pip_compile:
    executor: py37
    steps: [pip_compile]
@@ -178,6 +194,9 @@ workflows:
      - linters_37
      - e2e_tests_37
      - docs_37
+      - unit_tests_38
+      - linters_38
+      - e2e_tests_38
      - pip_compile
      - all_circleci_checks_succeeded:
          requires:
@@ -192,4 +211,7 @@ workflows:
            - linters_37
            - e2e_tests_37
            - docs_37
+            - unit_tests_38
+            - linters_38
+            - e2e_tests_38
            - pip_compile
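Taken together, the CI change simply mirrors the existing 3.7 executor and jobs onto a 3.8 Docker image. As a rough local stand-in for the new `unit_tests_38` job — assuming a `python3.8` interpreter on PATH, which is not something this commit provides — one might run:

```python
# Hypothetical local equivalent of the unit_tests_38 CI job (illustration only;
# the real job runs inside the quantumblacklabs/kedro_builder:3.8 Docker image).
import subprocess
import sys


def run_unit_tests(python: str = "python3.8") -> int:
    """Run the test suite under a specific interpreter, mirroring the CI matrix."""
    result = subprocess.run([python, "-m", "pytest", "tests/"], check=False)
    return result.returncode


if __name__ == "__main__":
    sys.exit(run_unit_tests())
```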
2 changes: 2 additions & 0 deletions .coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = tests/extras/datasets/spark/*
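The new `.coveragerc` keeps the Spark dataset tests out of coverage measurement, since on 3.8 they are skipped wholesale (see the `skip_if_py38` marker below). A rough programmatic equivalent, assuming the `coverage` package and shown only for illustration:

```python
# Rough programmatic equivalent of the new .coveragerc (assumes coverage.py is
# installed; in practice pytest-cov reads the INI file instead).
import coverage

cov = coverage.Coverage(omit=["tests/extras/datasets/spark/*"])
cov.start()
# ... import code and run tests here ...
cov.stop()
cov.report()  # prints a coverage summary, minus the omitted Spark test files
```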
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@
| Theme | Status |
|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Latest Release | [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/) |
-| Python Version | [![Python Version](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg)](https://pypi.org/project/kedro/) |
+| Python Version | [![Python Version](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7%20%7C%203.8-blue.svg)](https://pypi.org/project/kedro/) |
| `master` Branch Build | [![CircleCI](https://circleci.com/gh/quantumblacklabs/kedro/tree/master.svg?style=shield)](https://circleci.com/gh/quantumblacklabs/kedro/tree/master) |
| `develop` Branch Build | [![CircleCI](https://circleci.com/gh/quantumblacklabs/kedro/tree/develop.svg?style=shield)](https://circleci.com/gh/quantumblacklabs/kedro/tree/develop) |
| Documentation Build | [![Documentation](https://readthedocs.org/projects/kedro/badge/?version=latest)](https://kedro.readthedocs.io/) |
1 change: 1 addition & 0 deletions RELEASE.md
@@ -3,6 +3,7 @@
## Major features and improvements
* Added new CLI command `kedro catalog list`.
* Added a `GeoJSONDataSet` dataset in `kedro.extras.datasets.geopandas` for working with geospatial data that uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to communicate with the underlying filesystem.
+* Enabled Python 3.8 compatibility. _Please note that a Spark workflow may be unreliable for this Python version as `pyspark` is not fully compatible with 3.8 yet._

## Bug fixes and other changes
* Fixed a bug where a new version created mid-run by an external system caused inconsistencies in the load versions used in the current run.
2 changes: 1 addition & 1 deletion docs/source/06_resources/01_faq.md
@@ -137,7 +137,7 @@ You can find a list of Kedro projects in the [`kedro-examples`](https://github.c

## What version of Python does Kedro use?

-Kedro is built for Python 3.5, 3.6 and 3.7.
+Kedro is built for Python 3.5, 3.6, 3.7 and 3.8.

## How do I upgrade Kedro?

16 changes: 4 additions & 12 deletions kedro/template/{{ cookiecutter.repo_name }}/kedro_cli.py
@@ -211,9 +211,7 @@ def cli():
    "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP
)
@click.option("--parallel", "-p", is_flag=True, multiple=False, help=PARALLEL_ARG_HELP)
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP)
@click.option(
    "--load-version",
@@ -473,9 +471,7 @@ def jupyter():
    "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP
)
@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP)
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
def jupyter_notebook(ip, all_kernels, env, idle_timeout, args):
    """Open Jupyter Notebook with project specific variables loaded."""
    if "-h" not in args and "--help" not in args:
@@ -495,9 +491,7 @@ def jupyter_notebook(ip, all_kernels, env, idle_timeout, args):
    "--all-kernels", is_flag=True, default=False, help=JUPYTER_ALL_KERNELS_HELP
)
@click.option("--idle-timeout", type=int, default=30, help=JUPYTER_IDLE_TIMEOUT_HELP)
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
def jupyter_lab(ip, all_kernels, env, idle_timeout, args):
    """Open Jupyter Lab with project specific variables loaded."""
    if "-h" not in args and "--help" not in args:
@@ -608,9 +602,7 @@ def catalog():


@catalog.command("list")
-@click.option(
-    "--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP,
-)
+@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
@click.option(
    "--pipeline", type=str, default="", help=PIPELINE_ARG_HELP, callback=_split_string
)
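The `kedro_cli.py` hunks are purely cosmetic: dropping the trailing comma lets each three-line `--env` option collapse to a single line. A minimal runnable sketch of the collapsed style, using a hypothetical `demo` command rather than anything from the project template:

```python
# Hypothetical command showing the single-line option style used above.
import click

ENV_ARG_HELP = "Kedro configuration environment name."


@click.command()
@click.option("--env", "-e", type=str, default=None, multiple=False, help=ENV_ARG_HELP)
def demo(env):
    """Echo the chosen configuration environment."""
    click.echo(f"env={env}")


if __name__ == "__main__":
    demo()  # click parses sys.argv itself
```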
3 changes: 2 additions & 1 deletion setup.py
@@ -122,7 +122,7 @@
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/quantumblacklabs/kedro",
-    python_requires=">=3.5, <3.8",
+    python_requires=">=3.5, <3.9",
    packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]),
    include_package_data=True,
    tests_require=test_requires,
@@ -137,6 +137,7 @@
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
    ],
    extras_require=extras_require,
)
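The packaging change is twofold: `python_requires` now admits 3.8 while still excluding 3.9, and a new Trove classifier advertises the support on PyPI. A trimmed, illustrative sketch of the relevant `setup()` fields — the real file carries many more arguments:

```python
# Trimmed, illustrative view of the packaging change (not the full setup.py).
from setuptools import find_packages, setup

setup(
    name="kedro",
    python_requires=">=3.5, <3.9",  # pip refuses installation on other interpreters
    packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]),
    classifiers=[
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",  # new: advertised on PyPI
    ],
)
```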
4 changes: 2 additions & 2 deletions test_requirements.txt
@@ -11,7 +11,7 @@ gcsfs>=0.3.0, <1.0
geopandas>=0.6.0, <1.0
hdfs>=2.5.8, <3.0
import-linter==1.0; python_version >= '3.6'
-joblib==0.12.3
+joblib>=0.14
jupyter_client>=5.1.0, <6.0
matplotlib>=3.0.3, <4.0
memory_profiler>=0.50.0, <1.0
@@ -26,7 +26,7 @@ pre-commit>=1.17.0, <2.0.0
psutil==5.6.6
pyarrow>=0.12.0, <1.0.0
pylint>=2.4.4, <3.0
-pyspark>=2.2.0, <3.0
+pyspark>=2.2.0, <3.0; python_version < '3.8'
pytest-cov>=2.5, <3.0
pytest-mock>=1.7.1,<2.0
pytest>=3.9, <4.0
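The `pyspark` requirement gains a PEP 508 environment marker, so on Python 3.8 pip skips it entirely rather than installing a package that cannot run there. A minimal sketch of how such a marker evaluates, assuming the third-party `packaging` library (not a dependency added by this commit):

```python
# Evaluate the environment marker added to the pyspark requirement
# (requires the `packaging` library; illustration only).
from packaging.markers import Marker

marker = Marker("python_version < '3.8'")
# True on 3.5-3.7 (pyspark gets installed), False on 3.8 (requirement skipped).
print(marker.evaluate())
```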
15 changes: 13 additions & 2 deletions tests/conftest.py
@@ -37,9 +37,15 @@
import sys
from subprocess import Popen

+import mock
import pytest
-from pyspark import SparkContext
-from pyspark.sql import SparkSession

+if sys.version_info < (3, 8):
+    from pyspark import SparkContext
+    from pyspark.sql import SparkSession
+else:
+    SparkContext = mock.Mock()
+    SparkSession = mock.Mock()

the_real_getOrCreate = None

@@ -93,3 +99,8 @@ def preserve_system_context():

    if os.getcwd() != old_cwd:
        os.chdir(old_cwd)
+
+
+skip_if_py38 = pytest.mark.skipif(
+    sys.version_info >= (3, 8), reason="Dependency not compatible with Python 3.8 yet"
+)
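`conftest.py` now guards the `pyspark` imports behind a version check, substituting `mock.Mock()` placeholders so the module still imports cleanly on 3.8, and exposes a shared `skip_if_py38` marker. A minimal usage sketch — in the real suite the marker is imported from `tests.conftest` rather than redefined:

```python
# Hypothetical test module showing how the shared marker is applied.
import sys

import pytest

skip_if_py38 = pytest.mark.skipif(
    sys.version_info >= (3, 8), reason="Dependency not compatible with Python 3.8 yet"
)


@skip_if_py38
def test_spark_only_behaviour():
    # Only runs on Python < 3.8, where pyspark is importable.
    assert True
```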
4 changes: 2 additions & 2 deletions tests/context/test_context.py
@@ -376,8 +376,8 @@ def test_pipelines(self, dummy_context):
        assert len(dummy_context.pipelines["__default__"].nodes) == 4


-@pytest.mark.usefixtures("config_dir")  # pylint: disable=too-many-public-methods
-class TestKedroContextRun:
+@pytest.mark.usefixtures("config_dir")
+class TestKedroContextRun:  # pylint: disable=too-many-public-methods
    def test_run_output(self, dummy_context, dummy_dataframe):
        dummy_context.catalog.save("cars", dummy_dataframe)
        outputs = dummy_context.run()
20 changes: 17 additions & 3 deletions tests/extras/datasets/spark/test_memory_dataset.py
@@ -27,12 +27,22 @@
# limitations under the License.

# pylint: disable=no-name-in-module
+import sys
+
+import mock
import pytest
-from pyspark.sql import DataFrame as SparkDataFrame
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, when

from kedro.io import MemoryDataSet
+from tests.conftest import skip_if_py38
+
+if sys.version_info < (3, 8):
+    from pyspark.sql import SparkSession
+    from pyspark.sql import DataFrame as SparkDataFrame
+    from pyspark.sql.functions import col, when
+else:
+    SparkSession = mock.ANY
+    col = when = mock.ANY
+    SparkDataFrame = mock.ANY


def _update_spark_df(data, idx, jdx, value):
@@ -64,13 +74,15 @@ def memory_data_set(spark_data_frame):
    return MemoryDataSet(data=spark_data_frame)


+@skip_if_py38
def test_load_modify_original_data(memory_data_set, spark_data_frame):
    """Check that the data set object is not updated when the original
    SparkDataFrame is changed."""
    spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, -5)
    assert not _check_equals(memory_data_set.load(), spark_data_frame)


+@skip_if_py38
def test_save_modify_original_data(spark_data_frame):
    """Check that the data set object is not updated when the original
    SparkDataFrame is changed."""
@@ -81,6 +93,7 @@ def test_save_modify_original_data(spark_data_frame):
    assert not _check_equals(memory_data_set.load(), spark_data_frame)


+@skip_if_py38
def test_load_returns_same_spark_object(memory_data_set, spark_data_frame):
    """Test that consecutive loads point to the same object in case of
    a SparkDataFrame"""
@@ -91,6 +104,7 @@ def test_load_returns_same_spark_object(memory_data_set, spark_data_frame):
    assert loaded_data is reloaded_data


+@skip_if_py38
def test_str_representation(memory_data_set):
    """Test string representation of the data set"""
    assert "MemoryDataSet(data=<DataFrame>)" in str(memory_data_set)
28 changes: 22 additions & 6 deletions tests/extras/datasets/spark/test_spark_dataset.py
@@ -25,24 +25,35 @@
#
# See the License for the specific language governing permissions and
# limitations under the License.
-
+import sys
import tempfile
from pathlib import Path

+import mock
import pandas as pd
import pytest
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col  # pylint: disable=no-name-in-module
-from pyspark.sql.types import IntegerType, StringType, StructField, StructType
-from pyspark.sql.utils import AnalysisException

from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet
from kedro.extras.datasets.pickle import PickleDataSet
-from kedro.extras.datasets.spark import SparkDataSet
from kedro.io import DataCatalog, DataSetError, Version
from kedro.io.core import generate_timestamp
from kedro.pipeline import Pipeline, node
from kedro.runner import ParallelRunner
+from tests.conftest import skip_if_py38
+
+if sys.version_info < (3, 8):
+    from pyspark.sql import SparkSession
+    from pyspark.sql.functions import col  # pylint: disable=no-name-in-module
+    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+    from pyspark.sql.utils import AnalysisException
+    from kedro.extras.datasets.spark import SparkDataSet
+else:
+    SparkSession = mock.ANY
+    SparkDataSet = mock.ANY
+    col = mock.ANY
+    IntegerType = StringType = StructField = StructType = mock.ANY
+    AnalysisException = mock.ANY


FOLDER_NAME = "fake_folder"
FILENAME = "test.parquet"
@@ -143,6 +154,7 @@ def spark_out(tmp_path):
    return SparkDataSet(filepath=str(tmp_path / "output"))


+@skip_if_py38
class TestSparkDataSet:
    def test_load_parquet(self, tmp_path, sample_pandas_df):
        temp_path = str(tmp_path / "data")
@@ -334,6 +346,7 @@ def test_parallel_runner_with_memory_dataset(
        runner.run(pipeline, catalog)


+@skip_if_py38
class TestSparkDataSetVersionedLocal:
    def test_no_version(self, versioned_dataset_local):
        pattern = r"Did not find any versions for SparkDataSet\(.+\)"
@@ -399,6 +412,7 @@ def test_prevent_overwrite(self, tmp_path, version, sample_spark_df):
        versioned_local.save(sample_spark_df)


+@skip_if_py38
class TestSparkDataSetVersionedDBFS:
    def test_load_latest(  # pylint: disable=too-many-arguments
        self, mocker, versioned_dataset_dbfs, version, tmp_path, sample_spark_df
@@ -457,6 +471,7 @@ def test_exists(  # pylint: disable=too-many-arguments
        assert mocked_glob.call_args_list == expected_calls


+@skip_if_py38
class TestSparkDataSetVersionedS3:
    def test_no_version(self, versioned_dataset_s3):
        pattern = r"Did not find any versions for SparkDataSet\(.+\)"
@@ -569,6 +584,7 @@ def test_repr(self, versioned_dataset_s3, version):
        assert "version=" not in str(dataset_s3)


+@skip_if_py38
class TestSparkDataSetVersionedHdfs:
    def test_no_version(self, mocker, version):
        hdfs_walk = mocker.patch(
21 changes: 16 additions & 5 deletions tests/extras/datasets/spark/test_spark_hive_dataset.py
@@ -26,18 +26,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
+import sys
from pathlib import Path
from tempfile import TemporaryDirectory

+import mock
import pytest
from psutil import Popen
-from pyspark import SparkContext
-from pyspark.sql import SparkSession
-from pyspark.sql.types import IntegerType, StringType, StructField, StructType

-from kedro.extras.datasets.spark import SparkHiveDataSet
from kedro.io import DataSetError
-from tests.conftest import UseTheSparkSessionFixtureOrMock
+from tests.conftest import UseTheSparkSessionFixtureOrMock, skip_if_py38
+
+if sys.version_info < (3, 8):
+    from pyspark import SparkContext
+    from pyspark.sql import SparkSession
+    from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+    from kedro.extras.datasets.spark import SparkHiveDataSet
+else:
+    SparkContext = SparkSession = mock.ANY
+    IntegerType = StringType = StructField = StructType = mock.ANY
+    AnalysisException = mock.ANY
+    SparkHiveDataSet = mock.ANY


TESTSPARKDIR = "test_spark_dir"

@@ -156,6 +166,7 @@ def _generate_spark_df_upsert_expected():
    return SparkSession.builder.getOrCreate().createDataFrame(data, schema).coalesce(1)


+@skip_if_py38
class TestSparkHiveDataSet:
    def test_cant_pickle(self):
        import pickle  # pylint: disable=import-outside-toplevel