From 390c02fbaf3a2801f06b5bbab3e7ec3650785c56 Mon Sep 17 00:00:00 2001
From: Andrii Ivaniuk
Date: Fri, 26 Jun 2020 15:10:29 +0300
Subject: [PATCH] [KED-1796] Fixed versioning on Windows (#673)

---
 RELEASE.md                                     |  1 +
 docs/source/04_user_guide/04_data_catalog.md   |  1 +
 kedro/context/context.py                       | 29 +++++---
 .../biosequence/biosequence_dataset.py         |  4 +-
 kedro/extras/datasets/dask/parquet_dataset.py  |  2 +-
 .../datasets/geopandas/geojson_dataset.py      |  4 +-
 .../datasets/holoviews/holoviews_writer.py     |  2 +-
 .../datasets/matplotlib/matplotlib_writer.py   |  6 +-
 .../datasets/networkx/networkx_dataset.py      |  2 +-
 .../pandas/appendable_excel_dataset.py         |  8 +-
 kedro/extras/datasets/pandas/csv_dataset.py    |  2 +-
 kedro/extras/datasets/pandas/excel_dataset.py  |  4 +-
 .../extras/datasets/pandas/feather_dataset.py  |  4 +-
 kedro/extras/datasets/pandas/hdf_dataset.py    |  2 +-
 kedro/extras/datasets/pandas/json_dataset.py   |  2 +-
 .../extras/datasets/pandas/parquet_dataset.py  | 22 +++---
 .../extras/datasets/pickle/pickle_dataset.py   |  4 +-
 kedro/extras/datasets/pillow/image_dataset.py  |  4 +-
 kedro/extras/datasets/spark/spark_dataset.py   |  8 +-
 .../tensorflow/tensorflow_model_dataset.py     |  4 +-
 kedro/extras/datasets/text/text_dataset.py     |  2 +-
 kedro/extras/datasets/yaml/yaml_dataset.py     |  2 +-
 kedro/framework/context/context.py             | 29 +++++---
 kedro/io/core.py                               | 18 ++---
 tests/context/test_context.py                  | 59 +++++++++------
 .../datasets/geojson/test_geojson_dataset.py   | 10 +--
 .../holoviews/test_holoviews_writer.py         |  6 +-
 .../matplotlib/test_matplotlib_writer.py       | 10 +--
 .../networkx/test_networkx_dataset.py          |  2 +-
 .../pandas/test_appendable_excel_dataset.py    | 10 +--
 .../datasets/pandas/test_csv_dataset.py        |  5 +-
 .../datasets/pandas/test_excel_dataset.py      |  2 +-
 .../datasets/pandas/test_feather_dataset.py    |  2 +-
 .../datasets/pandas/test_hdf_dataset.py        |  2 +-
 .../datasets/pandas/test_json_dataset.py       |  2 +-
 .../datasets/pandas/test_parquet_dataset.py    |  8 +-
 .../datasets/pickle/test_pickle_dataset.py     |  2 +-
 .../datasets/pillow/test_image_dataset.py      |  6 +-
 .../datasets/spark/test_spark_dataset.py       | 73 +++++++++----------
 .../test_tensorflow_model_dataset.py           |  2 +-
 .../extras/datasets/text/test_text_dataset.py  |  2 +-
 .../extras/datasets/yaml/test_yaml_dataset.py  |  2 +-
 tests/framework/context/test_context.py        | 59 +++++++++------
 tests/framework/hooks/test_context_hooks.py    |  4 +-
 tests/io/test_data_catalog.py                  |  2 +-
 45 files changed, 245 insertions(+), 191 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index ecc057697c..26cf259002 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -17,6 +17,7 @@
 ## Bug fixes and other changes
 * Removed `/src/nodes` directory from the project template and made `kedro jupyter convert` create it on the fly if necessary.
 * Fixed a bug in `MatplotlibWriter` which prevented saving lists and dictionaries of plots locally on Windows.
+* Fixed broken versioning for Windows paths.
 * Fixed `DataSet` string representation for falsy values.
 * Improved the error message when duplicate nodes are passed to the `Pipeline` initializer.
diff --git a/docs/source/04_user_guide/04_data_catalog.md b/docs/source/04_user_guide/04_data_catalog.md
index 5900585d29..5dbb29080e 100644
--- a/docs/source/04_user_guide/04_data_catalog.md
+++ b/docs/source/04_user_guide/04_data_catalog.md
@@ -28,6 +28,7 @@ There are two ways of defining a Data Catalog through the use of YAML configuration
 ## Specifying the location of the dataset

 Kedro relies on [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) for reading and saving data from a variety of data stores including local file systems, network file systems, cloud object stores, and Hadoop. When specifying a storage location in `filepath:`, a URL should be provided using the general form `protocol://path/to/data`. If no protocol is provided, the local file system is assumed (same as ``file://``).
+> *Note:* all `filepath`s will be automatically converted to POSIX format, so it is recommended to write them in POSIX format in the first place (e.g. a Windows path written as `C:/users/kedro/project/data/01_raw/bikes.csv`).

 The following prepends are available:
 - **Local or Network File System**: `file://` - the local file system is default in the absence of any protocol, it also permits relative paths.
diff --git a/kedro/context/context.py b/kedro/context/context.py
index a133e21072..dd683d1c59 100644
--- a/kedro/context/context.py
+++ b/kedro/context/context.py
@@ -99,15 +99,18 @@ def _is_relative_path(path_string: str) -> bool:
     return True


-def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[str, Any]:
-    """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them to
-    ``project_path``. This is a hack to make sure that we don't have to change user's
-    working directory for logging and datasets to work. It is important for non-standard workflows
-    such as IPython notebook where users don't go through `kedro run` or `run.py` entrypoints.
+def _convert_paths_to_absolute_posix(
+    project_path: Path, conf_dictionary: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them
+    to ``project_path`` and convert absolute Windows paths to POSIX format. This is a hack to
+    make sure that we don't have to change user's working directory for logging and datasets to
+    work. It is important for non-standard workflows such as IPython notebook where users don't go
+    through `kedro run` or `run.py` entrypoints.

     Example:
     ::
-        >>> conf = _expand_path(
+        >>> conf = _convert_paths_to_absolute_posix(
         >>>     project_path=Path("/path/to/my/project"),
         >>>     conf_dictionary={
         >>>         "handlers": {
@@ -140,7 +143,9 @@ def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[st
         # if the conf_value is another dictionary, absolutify its paths first.
         if isinstance(conf_value, dict):
-            conf_dictionary[conf_key] = _expand_path(project_path, conf_value)
+            conf_dictionary[conf_key] = _convert_paths_to_absolute_posix(
+                project_path, conf_value
+            )
             continue

         # if the conf_value is not a dictionary nor a string, skip
@@ -152,8 +157,12 @@ def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[st
             continue

         if _is_relative_path(conf_value):
-            conf_value_absolute_path = str(project_path / conf_value)
+            # Absolute local path should be in POSIX format
+            conf_value_absolute_path = (project_path / conf_value).as_posix()
             conf_dictionary[conf_key] = conf_value_absolute_path
+        elif PureWindowsPath(conf_value).drive:
+            # Convert absolute Windows path to POSIX format
+            conf_dictionary[conf_key] = PureWindowsPath(conf_value).as_posix()

     return conf_dictionary

@@ -400,7 +409,7 @@ def _get_catalog(
         conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*")
         # turn relative paths in conf_catalog into absolute paths
         # before initializing the catalog
-        conf_catalog = _expand_path(
+        conf_catalog = _convert_paths_to_absolute_posix(
             project_path=self.project_path, conf_dictionary=conf_catalog
         )
         conf_creds = self._get_config_credentials()
@@ -495,7 +504,7 @@ def _setup_logging(self) -> None:
         """Register logging specified in logging directory."""
         conf_logging = self.config_loader.get("logging*", "logging*/**")
         # turn relative paths in logging config into absolute path before initialising loggers
-        conf_logging = _expand_path(
+        conf_logging = _convert_paths_to_absolute_posix(
             project_path=self.project_path, conf_dictionary=conf_logging
         )
         logging.config.dictConfig(conf_logging)
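For reference, a minimal illustration of the two conversion rules the renamed function now applies — pure `pathlib`, no Kedro imports, made-up paths:

```python
>>> from pathlib import Path, PureWindowsPath
>>> # Rule 1: relative paths are anchored to the project and rendered as POSIX
>>> (Path("/path/to/my/project") / "logs/info.log").as_posix()
'/path/to/my/project/logs/info.log'
>>> # Rule 2: absolute Windows paths (detected via a non-empty drive) are rewritten as POSIX
>>> PureWindowsPath("C:\\kedro\\data\\01_raw\\dataset.json").drive
'C:'
>>> PureWindowsPath("C:\\kedro\\data\\01_raw\\dataset.json").as_posix()
'C:/kedro/data/01_raw/dataset.json'
```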
diff --git a/kedro/extras/datasets/biosequence/biosequence_dataset.py b/kedro/extras/datasets/biosequence/biosequence_dataset.py
index ff4f26e191..1774ec3194 100644
--- a/kedro/extras/datasets/biosequence/biosequence_dataset.py
+++ b/kedro/extras/datasets/biosequence/biosequence_dataset.py
@@ -82,8 +82,8 @@ def __init__(
         to a concrete filepath.

         Args:
-            filepath: path to sequence file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to sequence file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
             load_args: Options for parsing sequence files by Biopython ``SeqIO.parse()``.
             save_args: file format supported by Biopython ``SeqIO.write()``.
diff --git a/kedro/extras/datasets/dask/parquet_dataset.py b/kedro/extras/datasets/dask/parquet_dataset.py
index 15629c62ab..3b347f0aaf 100644
--- a/kedro/extras/datasets/dask/parquet_dataset.py
+++ b/kedro/extras/datasets/dask/parquet_dataset.py
@@ -86,7 +86,7 @@ def __init__(
         parquet files.

         Args:
-            filepath: Path to a parquet file
+            filepath: Filepath in POSIX format to a parquet file
                 parquet collection or the directory of a multipart parquet.
             load_args: Additional loading options `dask.dataframe.read_parquet`:
                 https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet
diff --git a/kedro/extras/datasets/geopandas/geojson_dataset.py b/kedro/extras/datasets/geopandas/geojson_dataset.py
index 1f82703a61..4d508fb161 100644
--- a/kedro/extras/datasets/geopandas/geojson_dataset.py
+++ b/kedro/extras/datasets/geopandas/geojson_dataset.py
@@ -90,8 +90,8 @@ def __init__(

         Args:
-            filepath: Filepath to a GeoJSON file prefixed with a protocol like `s3://`.
-                If prefix is not provided `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a GeoJSON file prefixed with a protocol like
+                `s3://`. If prefix is not provided `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             load_args: GeoPandas options for loading GeoJSON files.
diff --git a/kedro/extras/datasets/holoviews/holoviews_writer.py b/kedro/extras/datasets/holoviews/holoviews_writer.py
index 4a96c57d5b..26c50fb171 100644
--- a/kedro/extras/datasets/holoviews/holoviews_writer.py
+++ b/kedro/extras/datasets/holoviews/holoviews_writer.py
@@ -79,7 +79,7 @@ def __init__(
         """Creates a new instance of ``HoloviewsWriter``.

         Args:
-            filepath: Filepath to a text file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
diff --git a/kedro/extras/datasets/matplotlib/matplotlib_writer.py b/kedro/extras/datasets/matplotlib/matplotlib_writer.py
index 67a108c7d5..54fbbe0e1a 100644
--- a/kedro/extras/datasets/matplotlib/matplotlib_writer.py
+++ b/kedro/extras/datasets/matplotlib/matplotlib_writer.py
@@ -103,9 +103,9 @@ def __init__(
         """Creates a new instance of ``MatplotlibWriter``.

         Args:
-            filepath: Key path to a matplot object file(s) prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
-                The prefix should be any protocol supported by ``fsspec``.
+            filepath: Filepath in POSIX format to a matplot object file(s) prefixed with a protocol
+                like `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be
+                used. The prefix should be any protocol supported by ``fsspec``.
             fs_args: Extra arguments to pass into underlying filesystem class constructor
                 (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
                 to pass to the filesystem's `open` method through nested key `open_args_save`.
diff --git a/kedro/extras/datasets/networkx/networkx_dataset.py b/kedro/extras/datasets/networkx/networkx_dataset.py
index 0ae03c8118..67d992ee99 100644
--- a/kedro/extras/datasets/networkx/networkx_dataset.py
+++ b/kedro/extras/datasets/networkx/networkx_dataset.py
@@ -82,7 +82,7 @@ def __init__(
         """Creates a new instance of ``NetworkXDataSet``.

         Args:
-            filepath: The path to the NetworkX graph JSON file.
+            filepath: Filepath in POSIX format to the NetworkX graph JSON file.
             load_args: Arguments passed on to ```networkx.node_link_graph``.
                 See the details in
                 https://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html
diff --git a/kedro/extras/datasets/pandas/appendable_excel_dataset.py b/kedro/extras/datasets/pandas/appendable_excel_dataset.py
index 87d585b57d..f4fd8cc9ca 100644
--- a/kedro/extras/datasets/pandas/appendable_excel_dataset.py
+++ b/kedro/extras/datasets/pandas/appendable_excel_dataset.py
@@ -30,7 +30,7 @@
 It uses pandas to handle the Excel file.
 """
 from copy import deepcopy
-from pathlib import Path, PurePath
+from pathlib import Path, PurePosixPath
 from typing import Any, Dict

 import pandas as pd
@@ -81,7 +81,7 @@ def __init__(
         Excel file to be opened in append mode.

         Args:
-            filepath: Filepath to an existing local Excel file.
+            filepath: Filepath in POSIX format to an existing local Excel file.
             load_args: Pandas options for loading Excel files.
                 Here you can find all available arguments:
                 https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
@@ -96,7 +96,7 @@ def __init__(
                 https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html
                 Note: `mode` option of `ExcelWriter` is set to `a` and it can not be overridden.
         """
-        self._filepath = PurePath(filepath)
+        self._filepath = PurePosixPath(filepath)

         # Handle default load and save arguments
         self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
@@ -136,4 +136,4 @@ def _save(self, data: pd.DataFrame) -> None:
         )

     def _exists(self) -> bool:
-        return Path(self._filepath).is_file()
+        return Path(self._filepath.as_posix()).is_file()
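A side note on the `_exists` pattern above: `PurePosixPath` is purely lexical and exposes no filesystem methods, so existence checks must round-trip through a concrete `Path`. A minimal sketch with a hypothetical filepath:

```python
from pathlib import Path, PurePosixPath

fp = PurePosixPath("C:/project/data/plan.xlsx")  # stored in POSIX form, even for a Windows drive
# Pure paths have no is_file()/exists(); build a concrete Path for local checks.
exists = Path(fp.as_posix()).is_file()
```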
diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py
index ba77cc05f7..d301dd0ef1 100644
--- a/kedro/extras/datasets/pandas/csv_dataset.py
+++ b/kedro/extras/datasets/pandas/csv_dataset.py
@@ -83,7 +83,7 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a CSV file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py
index d720bd2f92..24a1ad2b9f 100644
--- a/kedro/extras/datasets/pandas/excel_dataset.py
+++ b/kedro/extras/datasets/pandas/excel_dataset.py
@@ -85,8 +85,8 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a Excel file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to an Excel file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             engine: The engine used to write to excel files. The default
diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py
index 6b668d113b..1598ad089b 100644
--- a/kedro/extras/datasets/pandas/feather_dataset.py
+++ b/kedro/extras/datasets/pandas/feather_dataset.py
@@ -87,8 +87,8 @@ def __init__(
         filepath.

         Args:
-            filepath: Filepath to a feather file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a feather file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             load_args: Pandas options for loading feather files.
diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py
index fcc9dafcfc..64a258ec17 100644
--- a/kedro/extras/datasets/pandas/hdf_dataset.py
+++ b/kedro/extras/datasets/pandas/hdf_dataset.py
@@ -90,7 +90,7 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a hdf file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py
index 5fc5a8a5c1..6f34e157d7 100644
--- a/kedro/extras/datasets/pandas/json_dataset.py
+++ b/kedro/extras/datasets/pandas/json_dataset.py
@@ -83,7 +83,7 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a JSON file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.
                 If prefix is not provided `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py
index 618b48f4b2..364d291f5c 100644
--- a/kedro/extras/datasets/pandas/parquet_dataset.py
+++ b/kedro/extras/datasets/pandas/parquet_dataset.py
@@ -30,7 +30,7 @@
 filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file.
 """
 from copy import deepcopy
-from pathlib import PurePosixPath
+from pathlib import Path, PurePosixPath
 from typing import Any, Dict

 import fsspec
@@ -85,8 +85,8 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a Parquet file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a Parquet file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 It can also be a path to a directory. If the directory is provided
                 then it can be used for reading partitioned parquet files.
@@ -175,18 +175,16 @@ def _load(self) -> pd.DataFrame:

     def _save(self, data: pd.DataFrame) -> None:
         save_path = get_filepath_str(self._get_save_path(), self._protocol)

-        try:
-            table = pa.Table.from_pandas(data, **self._from_pandas_args)
-            pq.write_table(
-                table=table, where=save_path, filesystem=self._fs, **self._save_args
-            )
-        except IsADirectoryError as err:
+        if Path(save_path).is_dir():
             raise DataSetError(
-                "Saving {} to a directory is not supported. \n{}".format(
-                    self.__class__.__name__, str(err)
-                )
+                f"Saving {self.__class__.__name__} to a directory is not supported."
             )

+        table = pa.Table.from_pandas(data, **self._from_pandas_args)
+        pq.write_table(
+            table=table, where=save_path, filesystem=self._fs, **self._save_args
+        )
+
         self._invalidate_cache()

     def _exists(self) -> bool:
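The `_save` rewrite above trades exception handling for an up-front check, plausibly because `IsADirectoryError` is a POSIX-specific signal (Windows tends to raise `PermissionError` instead when a directory is opened for writing). A minimal sketch of the pattern, with a hypothetical error type:

```python
from pathlib import Path

def guard_against_directory(save_path: str) -> None:
    # Checking first keeps behaviour identical across platforms, instead of
    # relying on the OS-specific exception raised when writing to a directory.
    if Path(save_path).is_dir():
        raise ValueError(f"Saving to a directory is not supported: {save_path}")
```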
diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py
index af4b19b683..40d74e7837 100644
--- a/kedro/extras/datasets/pickle/pickle_dataset.py
+++ b/kedro/extras/datasets/pickle/pickle_dataset.py
@@ -95,8 +95,8 @@ def __init__(
         serialize/deserialize objects: `pickle` and `joblib`.

         Args:
-            filepath: Filepath to a Pickle file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to a Pickle file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             backend: Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'.
diff --git a/kedro/extras/datasets/pillow/image_dataset.py b/kedro/extras/datasets/pillow/image_dataset.py
index 5dfa22c007..9379513d22 100644
--- a/kedro/extras/datasets/pillow/image_dataset.py
+++ b/kedro/extras/datasets/pillow/image_dataset.py
@@ -76,8 +76,8 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to an image file prefixed with a protocol like `s3://`.
-                If prefix is not provided, `file` protocol (local filesystem) will be used.
+            filepath: Filepath in POSIX format to an image file prefixed with a protocol like
+                `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             save_args: Pillow options for saving image files.
diff --git a/kedro/extras/datasets/spark/spark_dataset.py b/kedro/extras/datasets/spark/spark_dataset.py
index c432080c06..bb422424d0 100644
--- a/kedro/extras/datasets/spark/spark_dataset.py
+++ b/kedro/extras/datasets/spark/spark_dataset.py
@@ -33,7 +33,7 @@
 from copy import deepcopy
 from fnmatch import fnmatch
 from functools import partial
-from pathlib import PurePath, PurePosixPath
+from pathlib import PurePosixPath
 from typing import Any, Dict, List, Optional, Tuple
 from warnings import warn
@@ -199,7 +199,7 @@ def __init__(  # pylint: disable=too-many-arguments
         """Creates a new instance of ``SparkDataSet``.

         Args:
-            filepath: Path to a Spark dataframe. When using Databricks
+            filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks
                 and working with data written to mount path points,
                 specify ``filepath``s for (versioned) ``SparkDataSet``s
                 starting with ``/dbfs/mnt``.
@@ -265,11 +265,9 @@ def __init__(  # pylint: disable=too-many-arguments
             path = PurePosixPath(filepath)

         else:
-            path = PurePath(filepath)  # type: ignore
+            path = PurePosixPath(filepath)

             if filepath.startswith("/dbfs"):
-                # Use PosixPath if the filepath references DBFS
-                path = PurePosixPath(filepath)
                 dbutils = _get_dbutils(self._get_spark())
                 if dbutils:
                     glob_function = partial(_dbfs_glob, dbutils=dbutils)
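This is the heart of the `SparkDataSet` fix: `PurePath` resolves to `PureWindowsPath` on Windows, so joining in a version segment produced backslashed paths that Spark and DBFS cannot consume. A quick illustration in plain `pathlib` (paths are made up):

```python
>>> from pathlib import PurePosixPath, PureWindowsPath
>>> str(PureWindowsPath("/dbfs/mnt/data") / "2020-06-26T12.10.29.000Z" / "data")
'\\dbfs\\mnt\\data\\2020-06-26T12.10.29.000Z\\data'
>>> str(PurePosixPath("/dbfs/mnt/data") / "2020-06-26T12.10.29.000Z" / "data")
'/dbfs/mnt/data/2020-06-26T12.10.29.000Z/data'
```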
diff --git a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py
index 7a1c8143b7..154fbe1704 100644
--- a/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py
+++ b/kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py
@@ -85,8 +85,8 @@ def __init__(
         """Creates a new instance of ``TensorFlowModelDataset``.

         Args:
-            filepath: Filepath to a TensorFlow model directory prefixed with a protocol
-                like `s3://`. If prefix is not provided `file` protocol (local filesystem)
+            filepath: Filepath in POSIX format to a TensorFlow model directory prefixed with a
+                protocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)
                 will be used. The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
             load_args: TensorFlow options for loading models.
diff --git a/kedro/extras/datasets/text/text_dataset.py b/kedro/extras/datasets/text/text_dataset.py
index 63792e631a..6f7cdce1b2 100644
--- a/kedro/extras/datasets/text/text_dataset.py
+++ b/kedro/extras/datasets/text/text_dataset.py
@@ -74,7 +74,7 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a text file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py
index ad52a64816..2220da953a 100644
--- a/kedro/extras/datasets/yaml/yaml_dataset.py
+++ b/kedro/extras/datasets/yaml/yaml_dataset.py
@@ -80,7 +80,7 @@ def __init__(
         on a specific filesystem.

         Args:
-            filepath: Filepath to a YAML file prefixed with a protocol like `s3://`.
+            filepath: Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.
                 If prefix is not provided, `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py
index 717dec9c95..7b88b72732 100644
--- a/kedro/framework/context/context.py
+++ b/kedro/framework/context/context.py
@@ -97,15 +97,18 @@ def _is_relative_path(path_string: str) -> bool:
     return True


-def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[str, Any]:
-    """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them to
-    ``project_path``. This is a hack to make sure that we don't have to change user's
-    working directory for logging and datasets to work. It is important for non-standard workflows
-    such as IPython notebook where users don't go through `kedro run` or `run.py` entrypoints.
+def _convert_paths_to_absolute_posix(
+    project_path: Path, conf_dictionary: Dict[str, Any]
+) -> Dict[str, Any]:
+    """Turn all relative paths inside ``conf_dictionary`` into absolute paths by appending them
+    to ``project_path`` and convert absolute Windows paths to POSIX format. This is a hack to
+    make sure that we don't have to change user's working directory for logging and datasets to
+    work. It is important for non-standard workflows such as IPython notebook where users don't go
+    through `kedro run` or `run.py` entrypoints.

     Example:
     ::
-        >>> conf = _expand_path(
+        >>> conf = _convert_paths_to_absolute_posix(
         >>>     project_path=Path("/path/to/my/project"),
         >>>     conf_dictionary={
         >>>         "handlers": {
@@ -138,7 +141,9 @@ def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[st
         # if the conf_value is another dictionary, absolutify its paths first.
         if isinstance(conf_value, dict):
-            conf_dictionary[conf_key] = _expand_path(project_path, conf_value)
+            conf_dictionary[conf_key] = _convert_paths_to_absolute_posix(
+                project_path, conf_value
+            )
             continue

         # if the conf_value is not a dictionary nor a string, skip
@@ -150,8 +155,12 @@ def _expand_path(project_path: Path, conf_dictionary: Dict[str, Any]) -> Dict[st
             continue

         if _is_relative_path(conf_value):
-            conf_value_absolute_path = str(project_path / conf_value)
+            # Absolute local path should be in POSIX format
+            conf_value_absolute_path = (project_path / conf_value).as_posix()
             conf_dictionary[conf_key] = conf_value_absolute_path
+        elif PureWindowsPath(conf_value).drive:
+            # Convert absolute Windows path to POSIX format
+            conf_dictionary[conf_key] = PureWindowsPath(conf_value).as_posix()

     return conf_dictionary

@@ -396,7 +405,7 @@ def _get_catalog(
         conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*")
         # turn relative paths in conf_catalog into absolute paths
         # before initializing the catalog
-        conf_catalog = _expand_path(
+        conf_catalog = _convert_paths_to_absolute_posix(
             project_path=self.project_path, conf_dictionary=conf_catalog
         )
         conf_creds = self._get_config_credentials()
@@ -491,7 +500,7 @@ def _setup_logging(self) -> None:
         """Register logging specified in logging directory."""
         conf_logging = self.config_loader.get("logging*", "logging*/**", "**/logging*")
         # turn relative paths in logging config into absolute path before initialising loggers
-        conf_logging = _expand_path(
+        conf_logging = _convert_paths_to_absolute_posix(
             project_path=self.project_path, conf_dictionary=conf_logging
         )
         logging.config.dictConfig(conf_logging)
diff --git a/kedro/io/core.py b/kedro/io/core.py
index c5da014caf..6a9f3a7ff1 100644
--- a/kedro/io/core.py
+++ b/kedro/io/core.py
@@ -40,7 +40,7 @@
 from functools import partial
 from glob import iglob
 from operator import attrgetter
-from pathlib import Path, PurePath
+from pathlib import Path, PurePath, PurePosixPath
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 from urllib.parse import urlsplit
@@ -120,7 +120,7 @@ class AbstractDataSet(abc.ABC):
     >>>         df.to_csv(str(self._filepath))
     >>>
     >>>     def _exists(self) -> bool:
-    >>>         return Path(self._filepath).exists()
+    >>>         return Path(self._filepath.as_posix()).exists()
     >>>
     >>>     def _describe(self):
     >>>         return dict(param1=self._param1, param2=self._param2)
@@ -505,7 +505,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC):
     >>>
     >>>     def _exists(self) -> bool:
     >>>         path = self._get_load_path()
-    >>>         return Path(path).exists()
+    >>>         return Path(path.as_posix()).exists()
     >>>
     >>>     def _describe(self):
     >>>         return dict(version=self._version, param1=self._param1, param2=self._param2)
@@ -523,7 +523,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC):

     def __init__(
         self,
-        filepath: PurePath,
+        filepath: PurePosixPath,
         version: Optional[Version],
         exists_function: Callable[[str], bool] = None,
         glob_function: Callable[[str], List[str]] = None,
@@ -531,7 +531,7 @@ def __init__(
         """Creates a new instance of ``AbstractVersionedDataSet``.

         Args:
-            filepath: Path to file.
+            filepath: Filepath in POSIX format to a file.
             version: If specified, should be an instance of
                 ``kedro.io.core.Version``. If its ``load`` attribute is
                 None, the latest version will be loaded. If its ``save``
@@ -580,7 +580,7 @@ def resolve_load_version(self) -> Optional[str]:
             return self._version.load
         return self._fetch_latest_load_version()

-    def _get_load_path(self) -> PurePath:
+    def _get_load_path(self) -> PurePosixPath:
         if not self._version:
             # When versioning is disabled, load from original filepath
             return self._filepath
@@ -596,7 +596,7 @@ def resolve_save_version(self) -> Optional[str]:
             return self._version.save
         return self._fetch_latest_save_version()

-    def _get_save_path(self) -> PurePath:
+    def _get_save_path(self) -> PurePosixPath:
         if not self._version:
             # When versioning is disabled, return original filepath
             return self._filepath
@@ -612,7 +612,7 @@ def _get_save_path(self) -> PurePath:

         return versioned_path

-    def _get_versioned_path(self, version: str) -> PurePath:
+    def _get_versioned_path(self, version: str) -> PurePosixPath:
         return self._filepath / version / self._filepath.name

     def load(self) -> Any:
@@ -734,7 +734,7 @@ def get_filepath_str(path: PurePath, protocol: str) -> str:
     Returns:
         Filepath string.
     """
-    path = str(path)
+    path = path.as_posix()
     if protocol in HTTP_PROTOCOLS:
         path = "".join((protocol, PROTOCOL_DELIMITER, path))
     return path
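Since `_get_versioned_path` builds the version into the path by joining (`filepath / version / filepath.name`), storing the filepath as `PurePosixPath` keeps versioned locations slash-separated on every OS. A small worked example with a made-up version string:

```python
>>> from pathlib import PurePosixPath
>>> fp = PurePosixPath("data/01_raw/cars.csv")
>>> (fp / "2020-06-26T12.10.29.000Z" / fp.name).as_posix()
'data/01_raw/cars.csv/2020-06-26T12.10.29.000Z/cars.csv'
```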
diff --git a/tests/context/test_context.py b/tests/context/test_context.py
index a319f16e97..5ffde0a7fc 100644
--- a/tests/context/test_context.py
+++ b/tests/context/test_context.py
@@ -29,7 +29,8 @@
 import configparser
 import json
 import re
-from pathlib import Path, PurePosixPath, PureWindowsPath
+from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath
+from time import sleep
 from typing import Any, Dict

 import pandas as pd
@@ -41,7 +42,7 @@
 from kedro.config import MissingConfigException
 from kedro.context import KedroContext, KedroContextError, validate_source_path
 from kedro.context.context import (
-    _expand_path,
+    _convert_paths_to_absolute_posix,
     _is_relative_path,
     _validate_layers_for_transcoding,
 )
@@ -101,8 +102,8 @@ def _write_dummy_ini(filepath: Path):

 @pytest.fixture
 def base_config(tmp_path):
-    cars_filepath = str(tmp_path / "cars.csv")
-    trains_filepath = str(tmp_path / "trains.csv")
+    cars_filepath = (tmp_path / "cars.csv").as_posix()
+    trains_filepath = (tmp_path / "trains.csv").as_posix()

     return {
         "trains": {"type": "pandas.CSVDataSet", "filepath": trains_filepath},
@@ -116,8 +117,8 @@ def base_config(tmp_path):

 @pytest.fixture
 def local_config(tmp_path):
-    cars_filepath = str(tmp_path / "cars.csv")
-    boats_filepath = str(tmp_path / "boats.csv")
+    cars_filepath = (tmp_path / "cars.csv").as_posix()
+    boats_filepath = (tmp_path / "boats.csv").as_posix()
     # use one dataset with a relative filepath
     horses_filepath = "horses.csv"
     return {
@@ -285,8 +286,11 @@ def test_get_catalog_always_using_absolute_path(self, dummy_context):
         # based on the project path
         catalog = dummy_context._get_catalog()
         ds_path = catalog._data_sets["horses"]._filepath
-        assert ds_path.is_absolute()
-        assert str(ds_path) == str(dummy_context._project_path / "horses.csv")
+        assert PurePath(ds_path.as_posix()).is_absolute()
+        assert (
+            ds_path.as_posix()
+            == (dummy_context._project_path / "horses.csv").as_posix()
+        )

     def test_get_catalog_validates_layers(self, dummy_context, mocker):
         mock_validate = mocker.patch(
@@ -428,8 +432,9 @@ def test_setup_logging_using_absolute_path(self, tmp_path, mocker):
         mocked_dict_config = mocker.patch("logging.config.dictConfig")
         dummy_context = DummyContext(str(tmp_path))
         called_args = mocked_dict_config.call_args[0][0]
-        assert called_args["info_file_handler"]["filename"] == str(
-            dummy_context._project_path / "logs" / "info.log"
+        assert (
+            called_args["info_file_handler"]["filename"]
+            == (dummy_context._project_path / "logs" / "info.log").as_posix()
         )
@@ -567,7 +572,7 @@ def _get_pipelines(self) -> Dict[str, Pipeline]:
         mocker.patch("logging.config.dictConfig")
         dummy_context = DummyContext(str(tmp_path))

-        filepath = str(dummy_context.project_path / "cars.csv")
+        filepath = (dummy_context.project_path / "cars.csv").as_posix()

         old_save_version = generate_timestamp()
         old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]})
@@ -578,6 +583,7 @@ def _get_pipelines(self) -> Dict[str, Pipeline]:
         )
         old_csv_data_set.save(old_df)

+        sleep(0.5)
         new_save_version = generate_timestamp()
         new_csv_data_set = CSVDataSet(
             filepath=filepath,
@@ -791,10 +797,10 @@ def test_is_relative_path(path_string: str, expected: bool):
     assert _is_relative_path(path_string) == expected


-def test_expand_path_raises_value_error_on_relative_project_path():
+def test_convert_paths_raises_error_on_relative_project_path():
     path = Path("relative/path")
     with pytest.raises(ValueError) as excinfo:
-        _expand_path(project_path=path, conf_dictionary={})
+        _convert_paths_to_absolute_posix(project_path=path, conf_dictionary={})

     assert (
         str(excinfo.value) == f"project_path must be an absolute path. Received: {path}"
@@ -816,8 +822,8 @@ def test_expand_path_raises_value_error_on_relative_project_path():
         ),
         (
             PureWindowsPath("C:\\kedro"),
-            {"my_dataset": {"path": "data\\01_raw\\dataset.json"}},
-            {"my_dataset": {"path": "C:\\kedro\\data\\01_raw\\dataset.json"}},
+            {"my_dataset": {"path": "data/01_raw/dataset.json"}},
+            {"my_dataset": {"path": "C:/kedro/data/01_raw/dataset.json"}},
         ),
         # test: the function shouldn't modify paths for key not associated with filepath
         (
@@ -827,10 +833,10 @@ def test_expand_path_raises_value_error_on_relative_project_path():
         ),
     ],
 )
-def test_expand_path_for_all_known_filepath_keys(
+def test_convert_paths_to_absolute_posix_for_all_known_filepath_keys(
     project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any]
 ):
-    assert _expand_path(project_path, input_conf) == expected
+    assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected


 @pytest.mark.parametrize(
@@ -846,17 +852,28 @@ def test_expand_path_for_all_known_filepath_keys(
             {"my_dataset": {"filepath": "s3://data/01_raw/dataset.json"}},
             {"my_dataset": {"filepath": "s3://data/01_raw/dataset.json"}},
         ),
+    ],
+)
+def test_convert_paths_to_absolute_posix_not_changing_non_relative_path(
+    project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any]
+):
+    assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected
+
+
+@pytest.mark.parametrize(
+    "project_path,input_conf,expected",
+    [
         (
             PureWindowsPath("D:\\kedro"),
-            {"my_dataset": {"path": "C:\\data\\01_raw\\dataset.json"}},
-            {"my_dataset": {"path": "C:\\data\\01_raw\\dataset.json"}},
+            {"my_dataset": {"path": r"C:\data\01_raw\dataset.json"}},
+            {"my_dataset": {"path": "C:/data/01_raw/dataset.json"}},
         ),
     ],
 )
-def test_expand_path_not_changing_non_relative_path(
+def test_convert_paths_to_absolute_posix_converts_full_windows_path_to_posix(
     project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any]
 ):
-    assert _expand_path(project_path, input_conf) == expected
+    assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected


 @pytest.mark.parametrize(
diff --git a/tests/extras/datasets/geojson/test_geojson_dataset.py b/tests/extras/datasets/geojson/test_geojson_dataset.py
index 6f9b6a1ffc..90df4e177d 100644
--- a/tests/extras/datasets/geojson/test_geojson_dataset.py
+++ b/tests/extras/datasets/geojson/test_geojson_dataset.py
@@ -53,7 +53,7 @@ def save_version(request):

 @pytest.fixture
 def filepath(tmp_path):
-    return str(tmp_path / "test.geojson")
+    return (tmp_path / "test.geojson").as_posix()


 @pytest.fixture(params=[None])
@@ -136,7 +136,7 @@ def test_open_extra_args(self, geojson_data_set, fs_args):
         assert geojson_data_set._fs_open_args_save == {"mode": "wb"}

     @pytest.mark.parametrize(
-        "filepath,instance_type",
+        "path,instance_type",
         [
             ("s3://bucket/file.geojson", S3FileSystem),
             ("/tmp/test.geojson", LocalFileSystem),
@@ -145,11 +145,11 @@ def test_open_extra_args(self, geojson_data_set, fs_args):
             ("https://example.com/file.geojson", HTTPFileSystem),
         ],
     )
-    def test_protocol_usage(self, filepath, instance_type):
-        geojson_data_set = GeoJSONDataSet(filepath=filepath)
+    def test_protocol_usage(self, path, instance_type):
+        geojson_data_set = GeoJSONDataSet(filepath=path)
         assert isinstance(geojson_data_set._fs, instance_type)

-        path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]
+        path = path.split(PROTOCOL_DELIMITER, 1)[-1]

         assert str(geojson_data_set._filepath) == path
         assert isinstance(geojson_data_set._filepath, PurePosixPath)
diff --git a/tests/extras/datasets/holoviews/test_holoviews_writer.py b/tests/extras/datasets/holoviews/test_holoviews_writer.py
index 3226c698fb..5e758169af 100644
--- a/tests/extras/datasets/holoviews/test_holoviews_writer.py
+++ b/tests/extras/datasets/holoviews/test_holoviews_writer.py
@@ -43,7 +43,7 @@

 @pytest.fixture
 def filepath_png(tmp_path):
-    return str(tmp_path / "test.png")
+    return (tmp_path / "test.png").as_posix()


 @pytest.fixture(scope="module")
@@ -66,7 +66,7 @@ def test_save_data(self, tmp_path, dummy_hv_object, hv_writer):
         """Test saving Holoviews object."""
         hv_writer.save(dummy_hv_object)

-        actual_filepath = Path(hv_writer._filepath)
+        actual_filepath = Path(hv_writer._filepath.as_posix())
         test_filepath = tmp_path / "locally_saved.png"
         hv.save(dummy_hv_object, test_filepath)
@@ -206,7 +206,7 @@ def test_save_data(self, versioned_hv_writer, dummy_hv_object, tmp_path):
         versioned_hv_writer.save(dummy_hv_object)

         test_filepath = tmp_path / "test_image.png"
-        actual_filepath = Path(versioned_hv_writer._get_load_path())
+        actual_filepath = Path(versioned_hv_writer._get_load_path().as_posix())

         hv.save(dummy_hv_object, test_filepath)
diff --git a/tests/extras/datasets/matplotlib/test_matplotlib_writer.py b/tests/extras/datasets/matplotlib/test_matplotlib_writer.py
index 9b9d96428e..1a9a62edd7 100644
--- a/tests/extras/datasets/matplotlib/test_matplotlib_writer.py
+++ b/tests/extras/datasets/matplotlib/test_matplotlib_writer.py
@@ -28,7 +28,7 @@

 import json
-from pathlib import PosixPath
+from pathlib import Path

 import matplotlib
 import matplotlib.pyplot as plt
@@ -131,7 +131,7 @@ def plot_writer(

 @pytest.fixture
 def versioned_plot_writer(tmp_path, load_version, save_version):
-    filepath = str(tmp_path / "matplotlib.png")
+    filepath = (tmp_path / "matplotlib.png").as_posix()
     return MatplotlibWriter(
         filepath=filepath, version=Version(load_version, save_version)
     )
@@ -323,7 +323,7 @@ def test_save_data(self, versioned_plot_writer, mock_single_plot, tmp_path):
         versioned_plot_writer.save(mock_single_plot)

         test_path = tmp_path / "test_image.png"
-        actual_filepath = PosixPath(versioned_plot_writer._get_load_path())
+        actual_filepath = Path(versioned_plot_writer._get_load_path().as_posix())

         plt.savefig(str(test_path))
@@ -340,7 +340,7 @@ def test_list_save(self, tmp_path, mock_list_plot, versioned_plot_writer):
             versioned_filepath = str(versioned_plot_writer._get_load_path())
             mock_list_plot[index].savefig(str(test_path))

-            actual_filepath = PosixPath("{}/{}.png".format(versioned_filepath, index))
+            actual_filepath = Path("{}/{}.png".format(versioned_filepath, index))

             assert actual_filepath.read_bytes() == test_path.read_bytes()
@@ -354,6 +354,6 @@ def test_dict_save(self, tmp_path, mock_dict_plot, versioned_plot_writer):
             versioned_filepath = str(versioned_plot_writer._get_load_path())
             mock_dict_plot[colour].savefig(str(test_path))

-            actual_filepath = PosixPath("{}/{}".format(versioned_filepath, colour))
+            actual_filepath = Path("{}/{}".format(versioned_filepath, colour))

             assert actual_filepath.read_bytes() == test_path.read_bytes()
diff --git a/tests/extras/datasets/networkx/test_networkx_dataset.py b/tests/extras/datasets/networkx/test_networkx_dataset.py
index 2e83a0ef4d..6980cd50fe 100644
--- a/tests/extras/datasets/networkx/test_networkx_dataset.py
+++ b/tests/extras/datasets/networkx/test_networkx_dataset.py
@@ -50,7 +50,7 @@

 @pytest.fixture
 def filepath_json(tmp_path):
-    return str(tmp_path / "some_dir" / "test.json")
+    return (tmp_path / "some_dir" / "test.json").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/pandas/test_appendable_excel_dataset.py b/tests/extras/datasets/pandas/test_appendable_excel_dataset.py
index edf39affeb..0024be949b 100644
--- a/tests/extras/datasets/pandas/test_appendable_excel_dataset.py
+++ b/tests/extras/datasets/pandas/test_appendable_excel_dataset.py
@@ -36,7 +36,7 @@

 @pytest.fixture
 def filepath(tmp_path):
-    return str(tmp_path / "test.xlsx")
+    return (tmp_path / "test.xlsx").as_posix()


 @pytest.fixture(scope="module")
@@ -54,7 +54,7 @@ def setup_excel_dataset(path):
 @pytest.fixture
 def appendable_excel_dataset(filepath, save_args, load_args):
     return AppendableExcelDataSet(
-        filepath=str(filepath), load_args=load_args, save_args=save_args
+        filepath=filepath, load_args=load_args, save_args=save_args
     )


@@ -63,7 +63,7 @@ def test_save_and_load(self, dummy_dataframe, filepath):
         """Test saving and reloading the data set."""
         excel_dataset, excel_df = setup_excel_dataset(filepath)
         appendable_excel_dataset = AppendableExcelDataSet(
-            filepath=str(filepath),
+            filepath=filepath,
             load_args={"sheet_name": "test"},
             save_args={"sheet_name": "test"},
         )
@@ -90,9 +90,9 @@ def test_save_and_load(self, dummy_dataframe, filepath):
     def test_exists(self, filepath):
         """Test `exists` method invocation for both existing and nonexistent data set."""
-        appendable_excel_dataset = AppendableExcelDataSet(str(filepath))
+        appendable_excel_dataset = AppendableExcelDataSet(filepath)
         assert not appendable_excel_dataset.exists()
-        setup_excel_dataset(str(filepath))
+        setup_excel_dataset(filepath)
         assert appendable_excel_dataset.exists()

     @pytest.mark.parametrize(
diff --git a/tests/extras/datasets/pandas/test_csv_dataset.py b/tests/extras/datasets/pandas/test_csv_dataset.py
index ff39dd2694..f13f9ad09d 100644
--- a/tests/extras/datasets/pandas/test_csv_dataset.py
+++ b/tests/extras/datasets/pandas/test_csv_dataset.py
@@ -27,6 +27,7 @@
 # limitations under the License.

 from pathlib import PurePosixPath
+from time import sleep

 import pandas as pd
 import pytest
@@ -44,7 +45,7 @@

 @pytest.fixture
 def filepath_csv(tmp_path):
-    return str(tmp_path / "test.csv")
+    return (tmp_path / "test.csv").as_posix()


 @pytest.fixture
@@ -184,6 +185,7 @@ def test_multiple_loads(
         versioned_csv_data_set.load()
         v1 = versioned_csv_data_set.resolve_load_version()

+        sleep(0.5)
         # force-drop a newer version into the same location
         v_new = generate_timestamp()
         CSVDataSet(filepath=filepath_csv, version=Version(v_new, v_new)).save(
@@ -210,6 +212,7 @@ def test_multiple_saves(self, dummy_dataframe, filepath_csv):
         assert first_load_version == first_save_version

         # second save
+        sleep(0.5)
         ds_versioned.save(dummy_dataframe)
         second_save_version = ds_versioned.resolve_save_version()
         second_load_version = ds_versioned.resolve_load_version()
diff --git a/tests/extras/datasets/pandas/test_excel_dataset.py b/tests/extras/datasets/pandas/test_excel_dataset.py
index cfc7656b38..fce1683818 100644
--- a/tests/extras/datasets/pandas/test_excel_dataset.py
+++ b/tests/extras/datasets/pandas/test_excel_dataset.py
@@ -43,7 +43,7 @@

 @pytest.fixture
 def filepath_excel(tmp_path):
-    return str(tmp_path / "test.xlsx")
+    return (tmp_path / "test.xlsx").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/pandas/test_feather_dataset.py b/tests/extras/datasets/pandas/test_feather_dataset.py
index b86a618c78..12a6519bf7 100644
--- a/tests/extras/datasets/pandas/test_feather_dataset.py
+++ b/tests/extras/datasets/pandas/test_feather_dataset.py
@@ -43,7 +43,7 @@

 @pytest.fixture
 def filepath_feather(tmp_path):
-    return str(tmp_path / "test.feather")
+    return (tmp_path / "test.feather").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/pandas/test_hdf_dataset.py b/tests/extras/datasets/pandas/test_hdf_dataset.py
index 3625f683b6..272af4456f 100644
--- a/tests/extras/datasets/pandas/test_hdf_dataset.py
+++ b/tests/extras/datasets/pandas/test_hdf_dataset.py
@@ -45,7 +45,7 @@

 @pytest.fixture
 def filepath_hdf(tmp_path):
-    return str(tmp_path / "test.h5")
+    return (tmp_path / "test.h5").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/pandas/test_json_dataset.py b/tests/extras/datasets/pandas/test_json_dataset.py
index 5d419c1cd5..fe5f22dcc8 100644
--- a/tests/extras/datasets/pandas/test_json_dataset.py
+++ b/tests/extras/datasets/pandas/test_json_dataset.py
@@ -44,7 +44,7 @@

 @pytest.fixture
 def filepath_json(tmp_path):
-    return str(tmp_path / "test.json")
+    return (tmp_path / "test.json").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/pandas/test_parquet_dataset.py b/tests/extras/datasets/pandas/test_parquet_dataset.py
index 7e3ccd1c4d..e1d0a875ae 100644
--- a/tests/extras/datasets/pandas/test_parquet_dataset.py
+++ b/tests/extras/datasets/pandas/test_parquet_dataset.py
@@ -47,7 +47,7 @@

 @pytest.fixture
 def filepath_parquet(tmp_path):
-    return str(tmp_path / FILENAME)
+    return (tmp_path / FILENAME).as_posix()


 @pytest.fixture
@@ -84,7 +84,7 @@ def test_credentials_propagated(self, mocker):

     def test_save_and_load(self, tmp_path, dummy_dataframe):
         """Test saving and reloading the data set."""
-        filepath = str(tmp_path / FILENAME)
+        filepath = (tmp_path / FILENAME).as_posix()
         data_set = ParquetDataSet(filepath=filepath)
         data_set.save(dummy_dataframe)
         reloaded = data_set.load()
@@ -170,7 +170,7 @@ def test_read_partitioned_file(self, mocker, tmp_path, dummy_dataframe):
             "pyarrow.parquet.ParquetDataset", wraps=pq.ParquetDataset
         )
         dummy_dataframe.to_parquet(str(tmp_path), partition_cols=["col2"])
-        data_set = ParquetDataSet(filepath=str(tmp_path))
+        data_set = ParquetDataSet(filepath=tmp_path.as_posix())

         reloaded = data_set.load()
         # Sort by columns because reading partitioned file results
@@ -183,7 +183,7 @@ def test_read_partitioned_file(self, mocker, tmp_path, dummy_dataframe):
         pq_ds_mock.assert_called_once()

     def test_write_to_dir(self, dummy_dataframe, tmp_path):
-        data_set = ParquetDataSet(filepath=str(tmp_path))
+        data_set = ParquetDataSet(filepath=tmp_path.as_posix())
         pattern = "Saving ParquetDataSet to a directory is not supported"

         with pytest.raises(DataSetError, match=pattern):
diff --git a/tests/extras/datasets/pickle/test_pickle_dataset.py b/tests/extras/datasets/pickle/test_pickle_dataset.py
index b13ac847f2..1fbe9a9af5 100644
--- a/tests/extras/datasets/pickle/test_pickle_dataset.py
+++ b/tests/extras/datasets/pickle/test_pickle_dataset.py
@@ -44,7 +44,7 @@

 @pytest.fixture
 def filepath_pickle(tmp_path):
-    return str(tmp_path / "test.pkl")
+    return (tmp_path / "test.pkl").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/pillow/test_image_dataset.py b/tests/extras/datasets/pillow/test_image_dataset.py
index bcb6ad4c63..ff1a0b6296 100644
--- a/tests/extras/datasets/pillow/test_image_dataset.py
+++ b/tests/extras/datasets/pillow/test_image_dataset.py
@@ -27,6 +27,7 @@
 # limitations under the License.

 from pathlib import Path, PurePosixPath
+from time import sleep

 import pytest
 from fsspec.implementations.http import HTTPFileSystem
@@ -41,7 +42,7 @@

 @pytest.fixture
 def filepath_png(tmp_path):
-    return str(tmp_path / "test.png")
+    return (tmp_path / "test.png").as_posix()


 @pytest.fixture
@@ -173,6 +174,9 @@ def test_multiple_loads(self, versioned_image_dataset, image_object, filepath_png):
         versioned_image_dataset.save(image_object)
         v1 = versioned_image_dataset.resolve_load_version()

+        # `v1` can sometimes equal `v_new` on Windows, most likely due to coarse
+        # clock resolution; `sleep()` guarantees distinct version timestamps.
+        sleep(0.5)
         # force-drop a newer version into the same location
         v_new = generate_timestamp()
         ImageDataSet(filepath=filepath_png, version=Version(v_new, v_new)).save(
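The `sleep(0.5)` calls scattered through these tests all address the same issue: two timestamps generated back to back can be identical when the OS clock is coarser than the formatted precision. A standalone illustration in pure `datetime` (the format string is an assumption for illustration):

```python
from datetime import datetime

fmt = "%Y-%m-%dT%H.%M.%S.%fZ"  # sub-second precision, similar to version strings
t1 = datetime.now().strftime(fmt)
t2 = datetime.now().strftime(fmt)
# On platforms with a coarse clock (historically ~15 ms on Windows),
# t1 == t2 is possible, so back-to-back "versions" could collide.
print(t1 == t2)
```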
diff --git a/tests/extras/datasets/spark/test_spark_dataset.py b/tests/extras/datasets/spark/test_spark_dataset.py
index cdb2484536..890dbaa86b 100644
--- a/tests/extras/datasets/spark/test_spark_dataset.py
+++ b/tests/extras/datasets/spark/test_spark_dataset.py
@@ -28,7 +28,7 @@
 # pylint: disable=import-error

 import tempfile
-from pathlib import Path, PurePosixPath, PureWindowsPath
+from pathlib import Path, PurePosixPath

 import pandas as pd
 import pytest
@@ -98,12 +98,14 @@ def version():

 @pytest.fixture
 def versioned_dataset_local(tmp_path, version):
-    return SparkDataSet(filepath=str(tmp_path / FILENAME), version=version)
+    return SparkDataSet(filepath=(tmp_path / FILENAME).as_posix(), version=version)


 @pytest.fixture
 def versioned_dataset_dbfs(tmp_path, version):
-    return SparkDataSet(filepath="/dbfs" + str(tmp_path / FILENAME), version=version)
+    return SparkDataSet(
+        filepath="/dbfs" + (tmp_path / FILENAME).as_posix(), version=version
+    )


 @pytest.fixture
@@ -135,7 +137,7 @@ def identity(arg):

 @pytest.fixture
 def spark_in(tmp_path, sample_spark_df):
-    spark_in = SparkDataSet(filepath=str(tmp_path / "input"))
+    spark_in = SparkDataSet(filepath=(tmp_path / "input").as_posix())
     spark_in.save(sample_spark_df)
     return spark_in
@@ -150,7 +152,7 @@ def isDir(self):

 class TestSparkDataSet:
     def test_load_parquet(self, tmp_path, sample_pandas_df):
-        temp_path = str(tmp_path / "data")
+        temp_path = (tmp_path / "data").as_posix()
         local_parquet_set = ParquetDataSet(filepath=temp_path)
         local_parquet_set.save(sample_pandas_df)
         spark_data_set = SparkDataSet(filepath=temp_path)
@@ -163,7 +165,7 @@ def test_save_parquet(self, tmp_path, sample_spark_df):
         # ParquetDataSet
         temp_dir = Path(str(tmp_path / "test_data"))
         spark_data_set = SparkDataSet(
-            filepath=str(temp_dir), save_args={"compression": "none"}
+            filepath=temp_dir.as_posix(), save_args={"compression": "none"}
         )
         spark_df = sample_spark_df.coalesce(1)
         spark_data_set.save(spark_df)
@@ -172,14 +174,14 @@ def test_save_parquet(self, tmp_path, sample_spark_df):
             f for f in temp_dir.iterdir() if f.is_file() and f.name.startswith("part")
         ][0]

-        local_parquet_data_set = ParquetDataSet(filepath=str(single_parquet))
+        local_parquet_data_set = ParquetDataSet(filepath=single_parquet.as_posix())

         pandas_df = local_parquet_data_set.load()

         assert pandas_df[pandas_df["name"] == "Bob"]["age"].iloc[0] == 12

     def test_load_options_csv(self, tmp_path, sample_pandas_df):
-        filepath = str(tmp_path / "data")
+        filepath = (tmp_path / "data").as_posix()
         local_csv_data_set = CSVDataSet(filepath=filepath)
         local_csv_data_set.save(sample_pandas_df)
         spark_data_set = SparkDataSet(
@@ -194,7 +196,7 @@ def test_save_options_csv(self, tmp_path, sample_spark_df):
         # CSVDataSet
         temp_dir = Path(str(tmp_path / "test_data"))
         spark_data_set = SparkDataSet(
-            filepath=str(temp_dir),
+            filepath=temp_dir.as_posix(),
             file_format="csv",
             save_args={"sep": "|", "header": True},
         )
@@ -206,7 +208,7 @@ def test_save_options_csv(self, tmp_path, sample_spark_df):
         ][0]

         csv_local_data_set = CSVDataSet(
-            filepath=str(single_csv_file), load_args={"sep": "|"}
+            filepath=single_csv_file.as_posix(), load_args={"sep": "|"}
         )
         pandas_df = csv_local_data_set.load()

@@ -214,17 +216,16 @@ def test_save_options_csv(self, tmp_path, sample_spark_df):

     def test_str_representation(self):
         with tempfile.NamedTemporaryFile() as temp_data_file:
+            filepath = Path(temp_data_file.name).as_posix()
             spark_data_set = SparkDataSet(
-                filepath=temp_data_file.name,
-                file_format="csv",
-                load_args={"header": True},
+                filepath=filepath, file_format="csv", load_args={"header": True},
             )
             assert "SparkDataSet" in str(spark_data_set)
-            assert "filepath={}".format(temp_data_file.name) in str(spark_data_set)
+            assert "filepath={}".format(filepath) in str(spark_data_set)

     def test_save_overwrite_fail(self, tmp_path, sample_spark_df):
         # Writes a data frame twice and expects it to fail.
-        filepath = str(tmp_path / "test_data")
+        filepath = (tmp_path / "test_data").as_posix()
         spark_data_set = SparkDataSet(filepath=filepath)
         spark_data_set.save(sample_spark_df)
@@ -233,7 +234,7 @@ def test_save_overwrite_fail(self, tmp_path, sample_spark_df):

     def test_save_overwrite_mode(self, tmp_path, sample_spark_df):
         # Writes a data frame in overwrite mode.
-        filepath = str(tmp_path / "test_data")
+        filepath = (tmp_path / "test_data").as_posix()
         spark_data_set = SparkDataSet(
             filepath=filepath, save_args={"mode": "overwrite"}
         )
@@ -248,7 +249,7 @@ def test_save_partition(self, tmp_path, sample_spark_df):
         filepath = Path(str(tmp_path / "test_data"))
         spark_data_set = SparkDataSet(
-            filepath=str(filepath),
+            filepath=filepath.as_posix(),
             save_args={"mode": "overwrite", "partitionBy": ["name"]},
         )

@@ -260,7 +261,7 @@ def test_save_partition(self, tmp_path, sample_spark_df):

     @pytest.mark.parametrize("file_format", ["csv", "parquet"])
     def test_exists(self, file_format, tmp_path, sample_spark_df):
-        filepath = str(tmp_path / "test_data")
+        filepath = (tmp_path / "test_data").as_posix()
         spark_data_set = SparkDataSet(filepath=filepath, file_format=file_format)

         assert not spark_data_set.exists()
@@ -311,7 +312,7 @@ def test_load_latest(self, versioned_dataset_local, sample_spark_df):
     def test_load_exact(self, tmp_path, sample_spark_df):
         ts = generate_timestamp()
         ds_local = SparkDataSet(
-            filepath=str(tmp_path / FILENAME), version=Version(ts, ts)
+            filepath=(tmp_path / FILENAME).as_posix(), version=Version(ts, ts)
         )

         ds_local.save(sample_spark_df)
@@ -328,13 +329,13 @@ def test_repr(self, versioned_dataset_local, tmp_path, version):
             versioned_dataset_local
         )

-        dataset_local = SparkDataSet(filepath=str(tmp_path / FILENAME))
+        dataset_local = SparkDataSet(filepath=(tmp_path / FILENAME).as_posix())
         assert "version=" not in str(dataset_local)

     def test_save_version_warning(self, tmp_path, sample_spark_df):
         exact_version = Version("2019-01-01T23.59.59.999Z", "2019-01-02T00.00.00.000Z")
         ds_local = SparkDataSet(
-            filepath=str(tmp_path / FILENAME), version=exact_version
+            filepath=(tmp_path / FILENAME).as_posix(), version=exact_version
         )

         pattern = (
@@ -346,7 +347,7 @@ def test_save_version_warning(self, tmp_path, sample_spark_df):

     def test_prevent_overwrite(self, tmp_path, version, sample_spark_df):
         versioned_local = SparkDataSet(
-            filepath=str(tmp_path / FILENAME),
+            filepath=(tmp_path / FILENAME).as_posix(),
             version=version,
             # second save should fail even in overwrite mode
             save_args={"mode": "overwrite"},
@@ -488,23 +489,19 @@ def test_get_dbutils_no_modules(self, mocker):
         mocker.patch.dict("sys.modules", {"pyspark": None, "IPython": None})
         assert _get_dbutils("spark") is None
-    @pytest.mark.parametrize(
-        "os_name,path_class", [("nt", PureWindowsPath), ("posix", PurePosixPath)]
-    )
-    def test_regular_path_in_different_os(self, os_name, path_class, mocker):
+    @pytest.mark.parametrize("os_name", ["nt", "posix"])
+    def test_regular_path_in_different_os(self, os_name, mocker):
         """Check that class of filepath depends on OS for regular path."""
         mocker.patch("os.name", os_name)
         data_set = SparkDataSet(filepath="/some/path")
-        assert isinstance(data_set._filepath, path_class)
+        assert isinstance(data_set._filepath, PurePosixPath)

-    @pytest.mark.parametrize(
-        "os_name,path_class", [("nt", PurePosixPath), ("posix", PurePosixPath)]
-    )
-    def test_dbfs_path_in_different_os(self, os_name, path_class, mocker):
+    @pytest.mark.parametrize("os_name", ["nt", "posix"])
+    def test_dbfs_path_in_different_os(self, os_name, mocker):
         """Check that class of filepath doesn't depend on OS if it references DBFS."""
         mocker.patch("os.name", os_name)
         data_set = SparkDataSet(filepath="/dbfs/some/path")
-        assert isinstance(data_set._filepath, path_class)
+        assert isinstance(data_set._filepath, PurePosixPath)


 class TestSparkDataSetVersionedS3:
@@ -776,10 +773,10 @@ def test_repr(self, version):

 @pytest.fixture
 def data_catalog(tmp_path):
-    source_path = str(Path(__file__).parent / "data/test.parquet")
-    spark_in = SparkDataSet(source_path)
-    spark_out = SparkDataSet(str(tmp_path / "spark_data"))
-    pickle_ds = PickleDataSet(str(tmp_path / "pickle/test.pkl"))
+    source_path = Path(__file__).parent / "data/test.parquet"
+    spark_in = SparkDataSet(source_path.as_posix())
+    spark_out = SparkDataSet((tmp_path / "spark_data").as_posix())
+    pickle_ds = PickleDataSet((tmp_path / "pickle/test.pkl").as_posix())

     return DataCatalog(
         {"spark_in": spark_in, "spark_out": spark_out, "pickle_ds": pickle_ds}
@@ -793,7 +790,7 @@ def test_spark_load_save(self, is_async, data_catalog):
         pipeline = Pipeline([node(identity, "spark_in", "spark_out")])
         SequentialRunner(is_async=is_async).run(pipeline, data_catalog)

-        save_path = Path(data_catalog._data_sets["spark_out"]._filepath)
+        save_path = Path(data_catalog._data_sets["spark_out"]._filepath.as_posix())
         files = list(save_path.glob("*.parquet"))
         assert len(files) > 0
@@ -815,6 +812,6 @@ def test_spark_memory_spark(self, is_async, data_catalog):
         )
         SequentialRunner(is_async=is_async).run(pipeline, data_catalog)

-        save_path = Path(data_catalog._data_sets["spark_out"]._filepath)
+        save_path = Path(data_catalog._data_sets["spark_out"]._filepath.as_posix())
         files = list(save_path.glob("*.parquet"))
         assert len(files) > 0
diff --git a/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py b/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py
index 1386212996..d580f41ad6 100644
--- a/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py
+++ b/tests/extras/datasets/tensorflow/test_tensorflow_model_dataset.py
@@ -49,7 +49,7 @@

 @pytest.fixture
 def filepath(tmp_path):
-    return str(tmp_path / "test_tf")
+    return (tmp_path / "test_tf").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/text/test_text_dataset.py b/tests/extras/datasets/text/test_text_dataset.py
index 3557dac2f9..7fcd63c6ef 100644
--- a/tests/extras/datasets/text/test_text_dataset.py
+++ b/tests/extras/datasets/text/test_text_dataset.py
@@ -43,7 +43,7 @@

 @pytest.fixture
 def filepath_txt(tmp_path):
-    return str(tmp_path / "test.txt")
+    return (tmp_path / "test.txt").as_posix()


 @pytest.fixture
diff --git a/tests/extras/datasets/yaml/test_yaml_dataset.py b/tests/extras/datasets/yaml/test_yaml_dataset.py
index 5ddc762bf3..3e5f7ee41c 100644
--- a/tests/extras/datasets/yaml/test_yaml_dataset.py
+++ b/tests/extras/datasets/yaml/test_yaml_dataset.py
@@ -43,7 +43,7 @@

 @pytest.fixture
 def filepath_yaml(tmp_path):
-    return str(tmp_path / "test.yaml")
+    return (tmp_path / "test.yaml").as_posix()


 @pytest.fixture
"test.yaml").as_posix() @pytest.fixture diff --git a/tests/framework/context/test_context.py b/tests/framework/context/test_context.py index 963e9fd158..75c72a2c62 100644 --- a/tests/framework/context/test_context.py +++ b/tests/framework/context/test_context.py @@ -29,7 +29,8 @@ import configparser import json import re -from pathlib import Path, PurePosixPath, PureWindowsPath +from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath +from time import sleep from typing import Any, Dict import pandas as pd @@ -46,7 +47,7 @@ validate_source_path, ) from kedro.framework.context.context import ( - _expand_path, + _convert_paths_to_absolute_posix, _is_relative_path, _validate_layers_for_transcoding, ) @@ -105,8 +106,8 @@ def _write_dummy_ini(filepath: Path): @pytest.fixture def base_config(tmp_path): - cars_filepath = str(tmp_path / "cars.csv") - trains_filepath = str(tmp_path / "trains.csv") + cars_filepath = (tmp_path / "cars.csv").as_posix() + trains_filepath = (tmp_path / "trains.csv").as_posix() return { "trains": {"type": "pandas.CSVDataSet", "filepath": trains_filepath}, @@ -120,8 +121,8 @@ def base_config(tmp_path): @pytest.fixture def local_config(tmp_path): - cars_filepath = str(tmp_path / "cars.csv") - boats_filepath = str(tmp_path / "boats.csv") + cars_filepath = (tmp_path / "cars.csv").as_posix() + boats_filepath = (tmp_path / "boats.csv").as_posix() # use one dataset with a relative filepath horses_filepath = "horses.csv" return { @@ -289,8 +290,11 @@ def test_get_catalog_always_using_absolute_path(self, dummy_context): # based on the project path catalog = dummy_context._get_catalog() ds_path = catalog._data_sets["horses"]._filepath - assert ds_path.is_absolute() - assert str(ds_path) == str(dummy_context._project_path / "horses.csv") + assert PurePath(ds_path.as_posix()).is_absolute() + assert ( + ds_path.as_posix() + == (dummy_context._project_path / "horses.csv").as_posix() + ) def test_get_catalog_validates_layers(self, dummy_context, mocker): mock_validate = mocker.patch( @@ -432,8 +436,9 @@ def test_setup_logging_using_absolute_path(self, tmp_path, mocker): mocked_dict_config = mocker.patch("logging.config.dictConfig") dummy_context = DummyContext(str(tmp_path)) called_args = mocked_dict_config.call_args[0][0] - assert called_args["info_file_handler"]["filename"] == str( - dummy_context._project_path / "logs" / "info.log" + assert ( + called_args["info_file_handler"]["filename"] + == (dummy_context._project_path / "logs" / "info.log").as_posix() ) @@ -571,7 +576,7 @@ def _get_pipelines(self) -> Dict[str, Pipeline]: mocker.patch("logging.config.dictConfig") dummy_context = DummyContext(str(tmp_path)) - filepath = str(dummy_context.project_path / "cars.csv") + filepath = (dummy_context.project_path / "cars.csv").as_posix() old_save_version = generate_timestamp() old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) @@ -582,6 +587,7 @@ def _get_pipelines(self) -> Dict[str, Pipeline]: ) old_csv_data_set.save(old_df) + sleep(0.5) new_save_version = generate_timestamp() new_csv_data_set = CSVDataSet( filepath=filepath, @@ -795,10 +801,10 @@ def test_is_relative_path(path_string: str, expected: bool): assert _is_relative_path(path_string) == expected -def test_expand_path_raises_value_error_on_relative_project_path(): +def test_convert_paths_raises_error_on_relative_project_path(): path = Path("relative/path") with pytest.raises(ValueError) as excinfo: - _expand_path(project_path=path, conf_dictionary={}) + 
+        _convert_paths_to_absolute_posix(project_path=path, conf_dictionary={})
 
     assert (
         str(excinfo.value) == f"project_path must be an absolute path. Received: {path}"
@@ -820,8 +826,8 @@ def test_expand_path_raises_value_error_on_relative_project_path():
         ),
         (
             PureWindowsPath("C:\\kedro"),
-            {"my_dataset": {"path": "data\\01_raw\\dataset.json"}},
-            {"my_dataset": {"path": "C:\\kedro\\data\\01_raw\\dataset.json"}},
+            {"my_dataset": {"path": "data/01_raw/dataset.json"}},
+            {"my_dataset": {"path": "C:/kedro/data/01_raw/dataset.json"}},
         ),
         # test: the function shouldn't modify paths for key not associated with filepath
         (
@@ -831,10 +837,10 @@
     ],
 )
-def test_expand_path_for_all_known_filepath_keys(
+def test_convert_paths_to_absolute_posix_for_all_known_filepath_keys(
     project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any]
 ):
-    assert _expand_path(project_path, input_conf) == expected
+    assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected
@@ -850,17 +856,28 @@ def test_expand_path_for_all_known_filepath_keys(
             {"my_dataset": {"filepath": "s3://data/01_raw/dataset.json"}},
             {"my_dataset": {"filepath": "s3://data/01_raw/dataset.json"}},
         ),
+    ],
+)
+def test_convert_paths_to_absolute_posix_not_changing_non_relative_path(
+    project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any]
+):
+    assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected
+
+
+@pytest.mark.parametrize(
+    "project_path,input_conf,expected",
+    [
         (
             PureWindowsPath("D:\\kedro"),
-            {"my_dataset": {"path": "C:\\data\\01_raw\\dataset.json"}},
-            {"my_dataset": {"path": "C:\\data\\01_raw\\dataset.json"}},
+            {"my_dataset": {"path": r"C:\data\01_raw\dataset.json"}},
+            {"my_dataset": {"path": "C:/data/01_raw/dataset.json"}},
         ),
     ],
 )
-def test_expand_path_not_changing_non_relative_path(
+def test_convert_paths_to_absolute_posix_converts_full_windows_path_to_posix(
     project_path: Path, input_conf: Dict[str, Any], expected: Dict[str, Any]
 ):
-    assert _expand_path(project_path, input_conf) == expected
+    assert _convert_paths_to_absolute_posix(project_path, input_conf) == expected
 
 
 @pytest.mark.parametrize(
diff --git a/tests/framework/hooks/test_context_hooks.py b/tests/framework/hooks/test_context_hooks.py
index 2e84694ca1..32906dfdcc 100644
--- a/tests/framework/hooks/test_context_hooks.py
+++ b/tests/framework/hooks/test_context_hooks.py
@@ -37,7 +37,7 @@
 
 from kedro import __version__
 from kedro.framework.context import KedroContext
-from kedro.framework.context.context import _expand_path
+from kedro.framework.context.context import _convert_paths_to_absolute_posix
 from kedro.framework.hooks import hook_impl
 from kedro.framework.hooks.manager import _create_hook_manager
 from kedro.io import DataCatalog
@@ -369,7 +369,7 @@ def test_after_catalog_created_hook_is_called(self, context_with_hooks, caplog):
             assert record.getMessage() == "Catalog created"
             assert record.catalog == catalog
             assert record.conf_creds == config_loader.get("credentials*")
-            assert record.conf_catalog == _expand_path(
+            assert record.conf_catalog == _convert_paths_to_absolute_posix(
                 project_path=context_with_hooks.project_path,
                 conf_dictionary=config_loader.get("catalog*"),
             )
diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py
index d49d262a71..c1055ec65e 100644
--- a/tests/io/test_data_catalog.py
+++ b/tests/io/test_data_catalog.py
@@ -52,7 +52,7 @@
 
 @pytest.fixture
 def
 def filepath(tmp_path):
-    return str(tmp_path / "some" / "dir" / "test.csv")
+    return (tmp_path / "some" / "dir" / "test.csv").as_posix()
 
 
 @pytest.fixture
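A note on the recurring `str(...)` → `.as_posix()` change in the fixtures above: `str()` renders a `pathlib` object with OS-native separators, so on Windows the old fixtures produced backslash paths, which is what broke versioning. The snippet below is illustrative only (the path is made up, and `PureWindowsPath` is used so it behaves the same on every OS):

```python
from pathlib import PureWindowsPath

# Join segments the same way the fixtures join tmp_path.
p = PureWindowsPath("C:/Users/kedro/project") / "data" / "test.csv"

print(str(p))        # C:\Users\kedro\project\data\test.csv  (OS-native separators)
print(p.as_posix())  # C:/Users/kedro/project/data/test.csv  (the form the datasets now expect)
```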
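The reworked `test_regular_path_in_different_os` and `test_dbfs_path_in_different_os` now assert that `SparkDataSet._filepath` is a `PurePosixPath` whatever `os.name` reports. A minimal sketch of such a normalisation, assuming a helper name invented here (`_to_pure_posix`) rather than the dataset's actual internals:

```python
from pathlib import PurePosixPath, PureWindowsPath


def _to_pure_posix(filepath: str) -> PurePosixPath:
    """Illustrative only: normalise any incoming filepath to a pure POSIX path."""
    # Treat input with a drive letter ("C:\\...") or backslash separators as Windows.
    if "\\" in filepath or PureWindowsPath(filepath).drive:
        return PurePosixPath(PureWindowsPath(filepath).as_posix())
    return PurePosixPath(filepath)


assert _to_pure_posix("C:\\some\\path") == PurePosixPath("C:/some/path")
assert _to_pure_posix("/some/path") == PurePosixPath("/some/path")
assert _to_pure_posix("/dbfs/some/path") == PurePosixPath("/dbfs/some/path")
```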
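Finally, the renamed `_convert_paths_to_absolute_posix` tests pin down three behaviours: relative paths are anchored to the project path, absolute Windows paths are rewritten to POSIX form, and protocol-qualified locations such as `s3://` are left untouched. A self-contained sketch of that contract, under the assumption that a single string-in/string-out helper captures it (`to_absolute_posix` is a made-up name, not the kedro source):

```python
from pathlib import PurePath, PurePosixPath, PureWindowsPath


def to_absolute_posix(project_path: PurePath, path_string: str) -> str:
    """Illustrative sketch of the conversion exercised by the tests above."""
    if "://" in path_string:  # fsspec-style URL (s3://, gcs://, ...): leave untouched
        return path_string
    if PureWindowsPath(path_string).drive:  # absolute Windows path, e.g. C:\data
        return PureWindowsPath(path_string).as_posix()
    if PurePosixPath(path_string).is_absolute():  # already absolute POSIX
        return path_string
    # Relative path: anchor it to the project directory and emit POSIX.
    return (project_path / path_string).as_posix()


assert (
    to_absolute_posix(PureWindowsPath("C:/kedro"), "data/01_raw/dataset.json")
    == "C:/kedro/data/01_raw/dataset.json"
)
assert (
    to_absolute_posix(PureWindowsPath("D:/kedro"), r"C:\data\01_raw\dataset.json")
    == "C:/data/01_raw/dataset.json"
)
assert (
    to_absolute_posix(PurePosixPath("/kedro"), "s3://data/01_raw/dataset.json")
    == "s3://data/01_raw/dataset.json"
)
```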