[KED-1796] Fixed versioning on Windows (#673)

kedro-org · Jun 26, 2020 · 390c02f · 390c02f
1 parent d44a318
commit 390c02f
Show file tree

Hide file tree

Showing 45 changed files with 245 additions and 191 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -17,6 +17,7 @@
 ## Bug fixes and other changes
 * Removed `/src/nodes` directory from the project template and made `kedro jupyter convert` create it on the fly if necessary.
 * Fixed a bug in `MatplotlibWriter` which prevented saving lists and dictionaries of plots locally on Windows.
+* Fixed broken versioning for Windows paths.
 * Fixed `DataSet` string representation for falsy values.
 * Improved the error message when duplicate nodes are passed to the `Pipeline` initializer.
 

diff --git a/docs/source/04_user_guide/04_data_catalog.md b/docs/source/04_user_guide/04_data_catalog.md
@@ -28,6 +28,7 @@ The are two ways of defining a Data Catalog through the use of YAML configuratio
 ## Specifying the location of the dataset
 
 Kedro relies on [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) for reading and saving data from a variety of data stores including local file systems, network file systems, cloud object stores, and Hadoop. When specifying a storage location in `filepath:`, a URL should be provided using the general form `protocol://path/to/data`.  If no protocol is provided, the local file system is assumed (same as ``file://``).
+> *Note:* All `filepath` values are automatically converted to POSIX format, but it is recommended to write them in POSIX format to begin with (e.g. on Windows: `C:/users/kedro/project/data/01_raw/bikes.csv`).
 
 The following prepends are available:
 - **Local or Network File System**: `file://` - the local file system is default in the absence of any protocol, it also permits relative paths.

diff --git a/kedro/context/context.py b/kedro/context/context.py
@@ -99,15 +99,18 @@ def _is_relative_path(path_string: str) -> bool:
     return True
 
 
-def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[str, Any]:
-    """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them to
-    ``project_path``. This is a hack to make sure that we don't have to change user's
-    working directory for logging and datasets to work. It is important for non-standard workflows
-    such as IPython notebook where users don't go through `kedro run` or `run.py` entrypoints.
+def _convert_paths_to_absolute_posix(
+    project_path: Path, conf_dictionary: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them
+    to ``project_path`` and convert absolute Windows paths to POSIX format. This is a hack to
+    make sure that we don't have to change user's working directory for logging and datasets to
+    work. It is important for non-standard workflows such as IPython notebook where users don't go
+    through `kedro run` or `run.py` entrypoints.
 
     Example:
     ::
-        >>> conf = _expand_path(
+        >>> conf = _convert_paths_to_absolute_posix(
         >>>     project_path=Path("/path/to/my/project"),
         >>>     conf_dictionary={
         >>>         "handlers": {
@@ -140,7 +143,9 @@ def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[st
 
         # if the conf_value is another dictionary, absolutify its paths first.
         if isinstance(conf_value, dict):
-            conf_dictionary[conf_key] = _expand_path(project_path, conf_value)
+            conf_dictionary[conf_key] = _convert_paths_to_absolute_posix(
+                project_path, conf_value
+            )
             continue
 
         # if the conf_value is not a dictionary nor a string, skip
@@ -152,8 +157,12 @@ def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[st
             continue
 
         if _is_relative_path(conf_value):
-            conf_value_absolute_path = str(project_path / conf_value)
+            # Absolute local path should be in POSIX format
+            conf_value_absolute_path = (project_path / conf_value).as_posix()
             conf_dictionary[conf_key] = conf_value_absolute_path
+        elif PureWindowsPath(conf_value).drive:
+            # Convert absolute Windows path to POSIX format
+            conf_dictionary[conf_key] = PureWindowsPath(conf_value).as_posix()
 
     return conf_dictionary
 
@@ -400,7 +409,7 @@ def _get_catalog(
         conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*")
         # turn relative paths in conf_catalog into absolute paths
         # before initializing the catalog
-        conf_catalog = _expand_path(
+        conf_catalog = _convert_paths_to_absolute_posix(
             project_path=self.project_path, conf_dictionary=conf_catalog
         )
         conf_creds = self._get_config_credentials()
@@ -495,7 +504,7 @@ def _setup_logging(self) -> None:
         """Register logging specified in logging directory."""
         conf_logging = self.config_loader.get("logging*", "logging*/**")
         # turn relative paths in logging config into absolute path before initialising loggers
-        conf_logging = _expand_path(
+        conf_logging = _convert_paths_to_absolute_posix(
             project_path=self.project_path, conf_dictionary=conf_logging
         )
         logging.config.dictConfig(conf_logging)

diff --git a/kedro/extras/datasets/biosequence/biosequence_dataset.py b/kedro/extras/datasets/biosequence/biosequence_dataset.py
@@ -82,8 +82,8 @@ def __init__(
         to a concrete filepath.
 
         Args:
-            filepath: path to sequence file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to sequence file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
             load_args: Options for parsing sequence files by Biopython ``SeqIO.parse()``.
             save_args: file format supported by Biopython ``SeqIO.write()``.

diff --git a/kedro/extras/datasets/dask/parquet_dataset.py b/kedro/extras/datasets/dask/parquet_dataset.py
@@ -86,7 +86,7 @@ def __init__(
         parquet files.
 
         Args:
-            filepath: Path to a parquet file
+            filepath: Filepath in POSIX format to a parquet file,
                 parquet collection or the directory of a multipart parquet.
             load_args: Additional loading options `dask.dataframe.read_parquet`:
                 https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet

diff --git a/kedro/extras/datasets/geopandas/geojson_dataset.py b/kedro/extras/datasets/geopandas/geojson_dataset.py
@@ -90,8 +90,8 @@ def __init__(
 
         Args:
 
-            filepath: Filepath to a GeoJSON file prefixed with a protocol like `s3://`.
-                If prefix is not provided `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a GeoJSON file prefixed with a protocol like
+                `s3://`. If prefix is not provided `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             load_args: GeoPandas options for loading GeoJSON files.

diff --git a/kedro/extras/datasets/holoviews/holoviews_writer.py b/kedro/extras/datasets/holoviews/holoviews_writer.py
@@ -79,7 +79,7 @@ def __init__(
         """Creates a new instance of ``HoloviewsWriter``.
 
         Args:
-            filepath: Filepath to a text file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.

diff --git a/kedro/extras/datasets/matplotlib/matplotlib_writer.py b/kedro/extras/datasets/matplotlib/matplotlib_writer.py
@@ -103,9 +103,9 @@ def __init__(
         """Creates a new instance of ``MatplotlibWriter``.
 
         Args:
-            filepath: Key path to a matplot object file(s) prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
-                The prefix should be any protocol supported by ``fsspec``.
+            filepath: Filepath in POSIX format to a matplot object file(s) prefixed with a protocol
+                like `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be
+                used. The prefix should be any protocol supported by ``fsspec``.
             fs_args: Extra arguments to pass into underlying filesystem class constructor
                 (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
                 to pass to the filesystem's `open` method through nested key `open_args_save`.

diff --git a/kedro/extras/datasets/networkx/networkx_dataset.py b/kedro/extras/datasets/networkx/networkx_dataset.py
@@ -82,7 +82,7 @@ def __init__(
         """Creates a new instance of ``NetworkXDataSet``.
 
         Args:
-            filepath: The path to the NetworkX graph JSON file.
+            filepath: Filepath in POSIX format to the NetworkX graph JSON file.
             load_args: Arguments passed on to ```networkx.node_link_graph``.
                 See the details in
                 https://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html

diff --git a/kedro/extras/datasets/pandas/appendable_excel_dataset.py b/kedro/extras/datasets/pandas/appendable_excel_dataset.py
@@ -30,7 +30,7 @@
 It uses pandas to handle the Excel file.
 """
 from copy import deepcopy
-from pathlib import Path, PurePath
+from pathlib import Path, PurePosixPath
 from typing import Any, Dict
 
 import pandas as pd
@@ -81,7 +81,7 @@ def __init__(
         Excel file to be opened in append mode.
 
         Args:
-            filepath: Filepath to an existing local Excel file.
+            filepath: Filepath in POSIX format to an existing local Excel file.
             load_args: Pandas options for loading Excel files.
                 Here you can find all available arguments:
                 https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
@@ -96,7 +96,7 @@ def __init__(
                 https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html
                 Note: `mode` option of `ExcelWriter` is set to `a` and it can not be overridden.
         """
-        self._filepath = PurePath(filepath)
+        self._filepath = PurePosixPath(filepath)
 
         # Handle default load and save arguments
         self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
@@ -136,4 +136,4 @@ def _save(self, data: pd.DataFrame) -> None:
             )
 
     def _exists(self) -> bool:
-        return Path(self._filepath).is_file()
+        return Path(self._filepath.as_posix()).is_file()
diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py
@@ -83,7 +83,7 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a CSV file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.

diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py
@@ -85,8 +85,8 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a Excel file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to an Excel file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             engine: The engine used to write to excel files. The default

diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py
@@ -87,8 +87,8 @@ def __init__(
         filepath.
 
         Args:
-            filepath: Filepath to a feather file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a feather file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             load_args: Pandas options for loading feather files.

diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py
@@ -90,7 +90,7 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a hdf file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.

diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py
@@ -83,7 +83,7 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a JSON file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.
                 If prefix is not provided `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.

diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py
@@ -30,7 +30,7 @@
 filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file.
 """
 from copy import deepcopy
-from pathlib import PurePosixPath
+from pathlib import Path, PurePosixPath
 from typing import Any, Dict
 
 import fsspec
@@ -85,8 +85,8 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a Parquet file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a Parquet file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 It can also be a path to a directory. If the directory is
                 provided then it can be used for reading partitioned parquet files.
@@ -175,18 +175,16 @@ def _load(self) -> pd.DataFrame:
     def _save(self, data: pd.DataFrame) -> None:
         save_path = get_filepath_str(self._get_save_path(), self._protocol)
 
-        try:
-            table = pa.Table.from_pandas(data, **self._from_pandas_args)
-            pq.write_table(
-                table=table, where=save_path, filesystem=self._fs, **self._save_args
-            )
-        except IsADirectoryError as err:
+        if Path(save_path).is_dir():
             raise DataSetError(
-                "Saving {} to a directory is not supported. \n{}".format(
-                    self.__class__.__name__, str(err)
-                )
+                f"Saving {self.__class__.__name__} to a directory is not supported."
             )
 
+        table = pa.Table.from_pandas(data, **self._from_pandas_args)
+        pq.write_table(
+            table=table, where=save_path, filesystem=self._fs, **self._save_args
+        )
+
         self._invalidate_cache()
 
     def _exists(self) -> bool:

diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py
@@ -95,8 +95,8 @@ def __init__(
         serialize/deserialize objects: `pickle` and `joblib`.
 
         Args:
-            filepath: Filepath to a Pickle file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a Pickle file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             backend: Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'.

diff --git a/kedro/extras/datasets/pillow/image_dataset.py b/kedro/extras/datasets/pillow/image_dataset.py
@@ -76,8 +76,8 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to an image file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to an image file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             save_args: Pillow options for saving image files.

diff --git a/kedro/extras/datasets/spark/spark_dataset.py b/kedro/extras/datasets/spark/spark_dataset.py
@@ -33,7 +33,7 @@
 from copy import deepcopy
 from fnmatch import fnmatch
 from functools import partial
-from pathlib import PurePath, PurePosixPath
+from pathlib import PurePosixPath
 from typing import Any, Dict, List, Optional, Tuple
 from warnings import warn
 
@@ -199,7 +199,7 @@ def __init__(  # pylint: disable=too-many-arguments
         """Creates a new instance of ``SparkDataSet``.
 
         Args:
-            filepath: Path to a Spark dataframe. When using Databricks
+            filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks
                 and working with data written to mount path points,
                 specify ``filepath``s for (versioned) ``SparkDataSet``s
                 starting with ``/dbfs/mnt``.
@@ -265,11 +265,9 @@ def __init__(  # pylint: disable=too-many-arguments
             path = PurePosixPath(filepath)
 
         else:
-            path = PurePath(filepath)  # type: ignore
+            path = PurePosixPath(filepath)
 
             if filepath.startswith("/dbfs"):
-                # Use PosixPath if the filepath references DBFS
-                path = PurePosixPath(filepath)
                 dbutils = _get_dbutils(self._get_spark())
                 if dbutils:
                     glob_function = partial(_dbfs_glob, dbutils=dbutils)

diff --git a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py
@@ -85,8 +85,8 @@ def __init__(
         """Creates a new instance of ``TensorFlowModelDataset``.
 
         Args:
-            filepath: Filepath to a TensorFlow model directory prefixed with a protocol
-                like `s3://`. If prefix is not provided `file` protocol (local filesystem)
+            filepath: Filepath in POSIX format to a TensorFlow model directory prefixed with a
+                protocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)
                 will be used. The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             load_args: TensorFlow options for loading models.

diff --git a/kedro/extras/datasets/text/text_dataset.py b/kedro/extras/datasets/text/text_dataset.py
@@ -74,7 +74,7 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a text file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.

diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py
@@ -80,7 +80,7 @@ def __init__(
         on a specific filesystem.
 
         Args:
-            filepath: Filepath to a YAML file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.