
feat: add matlab dataset into kedro-datasets #435

Closed
Changes from all commits (105 commits)
f73fcfd
feat(datasets): Migrated `PartitionedDataSet` and `IncrementalDataSet…
PtrBld Oct 11, 2023
6c1b73f
fix: backwards compatibility for `kedro-airflow` (#381)
sbrugman Oct 12, 2023
3283618
fix(datasets): Don't warn for SparkDataset on Databricks when using s…
alamastor Oct 12, 2023
8f782e7
chore: Hot fix for RTD due to bad pip version (#396)
noklam Oct 17, 2023
48fde27
chore: Pin pip version temporarily (#398)
ankatiyar Oct 18, 2023
34011aa
perf(datasets): don't create connection until need (#281)
deepyaman Oct 19, 2023
bb8e6ce
chore: Drop Python 3.7 support for kedro-plugins (#392)
lrcouto Oct 19, 2023
ddef350
feat(datasets): support Polars lazy evaluation (#350)
MatthiasRoels Oct 20, 2023
aa46530
build(datasets): Release `1.8.0` (#406)
merelcht Oct 24, 2023
74cc4de
build(airflow): Release 0.7.0 (#407)
ankatiyar Oct 24, 2023
9c06069
build(telemetry): Release 0.3.0 (#408)
ankatiyar Oct 24, 2023
8af5870
build(docker): Release 0.4.0 (#409)
ankatiyar Oct 24, 2023
8d7838a
style(airflow): blacken README.md of Kedro-Airflow (#418)
deepyaman Oct 25, 2023
0a7bdc2
fix(datasets): Fix missing jQuery (#414)
astrojuanlu Oct 25, 2023
a8e9319
fix(datasets): Fix Lazy Polars dataset to use the new-style base clas…
astrojuanlu Oct 25, 2023
9f7b24c
chore(datasets): lazily load `partitions` classes (#411)
deepyaman Oct 25, 2023
1948c10
docs(datasets): fix code blocks and `data_set` use (#417)
deepyaman Oct 25, 2023
c10e043
fix: TF model load failure when model is saved as a TensorFlow Saved …
Edouard59 Oct 26, 2023
4c09ada
chore: Drop support for Python 3.7 on kedro-datasets (#419)
lrcouto Oct 27, 2023
a7b2967
test(datasets): run doctests to check examples run (#416)
deepyaman Oct 27, 2023
8b538b7
feat(datasets): Add support for `databricks-connect>=13.0` (#352)
MigQ2 Nov 1, 2023
9f10b42
fix(telemetry): remove double execution by moving to after catalog cr…
fdroessler Nov 1, 2023
02d67b7
docs: Add python version support policy to plugin `README.md`s (#425)
merelcht Nov 2, 2023
3782147
matlab_dataset init
samuel-lee-sj Nov 6, 2023
7589c05
Pytest returns: No data to report
samuel-lee-sj Nov 14, 2023
7ba05a8
Completed pytest on the matlab dataset
samuel-lee-sj Nov 17, 2023
00b91cb
Point before pull request.
samuel-lee-sj Nov 17, 2023
0cb9107
docs(airflow): Use new docs link (#393)
astrojuanlu Nov 10, 2023
8209071
style: Add shared CSS and meganav to datasets docs (#400)
stichbury Nov 10, 2023
6685a07
feat(datasets): Add Hugging Face datasets (#344)
astrojuanlu Nov 13, 2023
ffa733c
Reviewed and resolved errors in code 201123
samuel-lee-sj Nov 20, 2023
fc7e7a0
amended docstring for example
samuel-lee-sj Nov 21, 2023
886eef4
added function _invalidate_cache
samuel-lee-sj Nov 21, 2023
411a074
Added credentials, file save args to init
samuel-lee-sj Nov 22, 2023
4fddd0a
test(datasets): fix `dask.ParquetDataset` doctests (#439)
deepyaman Nov 22, 2023
a6a9e43
refactor: Remove `DataSet` aliases and mentions (#440)
merelcht Nov 24, 2023
df4a782
chore(datasets): replace "Pyspark" with "PySpark" (#423)
deepyaman Nov 25, 2023
5ddc210
test(datasets): make `api.APIDataset` doctests run (#448)
deepyaman Nov 27, 2023
4430dca
chore(datasets): Fix `pandas.GenericDataset` doctest (#445)
merelcht Nov 27, 2023
c34fa85
feat(datasets): make datasets arguments keywords only (#358)
felixscherz Nov 27, 2023
7854c85
chore: Drop support for python 3.8 on kedro-datasets (#442)
DimedS Nov 27, 2023
451b2d1
test(datasets): add outputs to matplotlib doctests (#449)
deepyaman Nov 28, 2023
7846c07
chore(datasets): Fix more doctest issues (#451)
merelcht Nov 28, 2023
ff58c81
test(datasets): fix failing doctests in Windows CI (#457)
deepyaman Nov 30, 2023
0381206
chore(datasets): fix accidental reference to NumPy (#450)
deepyaman Nov 30, 2023
d6dcc93
chore(datasets): don't pollute dev env in doctests (#452)
deepyaman Nov 30, 2023
80a1962
feat: Add tools to heap event (#430)
lrcouto Nov 30, 2023
95b6780
ci(datasets): install deps in single `pip install` (#454)
deepyaman Dec 5, 2023
69c7a14
build(datasets): Bump s3fs (#463)
merelcht Dec 7, 2023
06ee879
test(datasets): make SQL dataset examples runnable (#455)
deepyaman Dec 7, 2023
093fc8c
fix(datasets): correct pandas-gbq as py311 dependency (#460)
kuruonur1 Dec 7, 2023
dcbadcc
docs(datasets): Document `IncrementalDataset` (#468)
astrojuanlu Dec 8, 2023
50dffce
chore: Update datasets to be arguments keyword only (#466)
merelcht Dec 8, 2023
85c38ed
chore: Clean up code for old dataset syntax compatibility (#465)
merelcht Dec 8, 2023
c73671d
chore: Update scikit-learn version (#469)
noklam Dec 8, 2023
1ab6611
feat(datasets): support versioning data partitions (#447)
deepyaman Dec 11, 2023
9825160
docs(datasets): Improve documentation index (#428)
astrojuanlu Dec 11, 2023
4bc2e66
docs(datasets): update wrong docstring about `con` (#461)
deepyaman Dec 11, 2023
350085b
build(datasets): Release `2.0.0` (#472)
merelcht Dec 11, 2023
6be91f6
ci(telemetry): Pin `PyYAML` (#474)
ankatiyar Dec 12, 2023
719009b
build(telemetry): Release 0.3.1 (#475)
SajidAlamQB Dec 12, 2023
1377cfe
docs(datasets): Fix broken links in README (#477)
astrojuanlu Dec 12, 2023
827c5f2
chore(datasets): replace more "data_set" instances (#476)
deepyaman Dec 12, 2023
217fe28
chore(datasets): Fix doctests (#488)
merelcht Dec 19, 2023
c02c32d
chore(datasets): Fix delta + incremental dataset docstrings (#489)
merelcht Dec 20, 2023
37c11ea
chore(airflow): Post 0.19 cleanup (#478)
ankatiyar Dec 20, 2023
d3a7995
build(airflow): Release 0.8.0 (#491)
ankatiyar Dec 20, 2023
74d83d2
fix: telemetry metadata (#495)
DimedS Dec 21, 2023
d3a7a22
fix: Update tests on kedro-docker for 0.5.0 release. (#496)
lrcouto Dec 21, 2023
1aa3968
rebase matlab_dataset to incoming
samuel-lee-sj Jan 12, 2024
621ab35
Rebase 9ebe88c
samuel-lee-sj Jan 12, 2024
5a67aa3
Point before pull request.
samuel-lee-sj Nov 17, 2023
f6a01ac
Reviewed and resolved errors in code 201123
samuel-lee-sj Nov 20, 2023
77f4286
Added credentials, file save args to init
samuel-lee-sj Nov 22, 2023
ac314dc
build: Release kedro-docker 0.5.0 (#497)
lrcouto Dec 21, 2023
24c51a7
chore(datasets): Update partitioned dataset docstring (#502)
merelcht Jan 3, 2024
f47e7c9
fix(datasets): Relax pandas.HDFDataSet dependencies which are broken …
Galileo-Galilei Jan 4, 2024
43fb7cb
fix: airflow metadata (#498)
AhdraMeraliQB Jan 4, 2024
2dd8723
amended docstring for example
samuel-lee-sj Nov 21, 2023
3900703
added function _invalidate_cache
samuel-lee-sj Nov 21, 2023
c4cb619
rebase in progress onto d8f1fd5
samuel-lee-sj Jan 12, 2024
bc76e83
docs: Add python version support policy to plugin `README.md`s (#425)
merelcht Nov 2, 2023
dba9dc8
matlab_dataset init
samuel-lee-sj Nov 6, 2023
5a031b7
Pytest returns: No data to report
samuel-lee-sj Nov 14, 2023
a204f6b
Completed pytest on the matlab dataset
samuel-lee-sj Nov 17, 2023
77ef1ee
Point before pull request.
samuel-lee-sj Nov 17, 2023
69029a0
style: Add shared CSS and meganav to datasets docs (#400)
stichbury Nov 10, 2023
20dc8ae
feat(datasets): Add Hugging Face datasets (#344)
astrojuanlu Nov 13, 2023
afe6e7e
Reviewed and resolved errors in code 201123
samuel-lee-sj Nov 20, 2023
9da01f7
amended docstring for example
samuel-lee-sj Nov 21, 2023
ac175c9
added function _invalidate_cache
samuel-lee-sj Nov 21, 2023
76f5130
Added credentials, file save args to init
samuel-lee-sj Nov 22, 2023
df827ee
test(datasets): fix `dask.ParquetDataset` doctests (#439)
deepyaman Nov 22, 2023
94f9ec8
refactor: Remove `DataSet` aliases and mentions (#440)
merelcht Nov 24, 2023
1e4a968
test(datasets): make `api.APIDataset` doctests run (#448)
deepyaman Nov 27, 2023
757ea12
test(datasets): add outputs to matplotlib doctests (#449)
deepyaman Nov 28, 2023
ac388fa
chore(datasets): fix accidental reference to NumPy (#450)
deepyaman Nov 30, 2023
60cec53
feat: Add tools to heap event (#430)
lrcouto Nov 30, 2023
3322c01
fix(datasets): correct pandas-gbq as py311 dependency (#460)
kuruonur1 Dec 7, 2023
6c0c99c
chore: Clean up code for old dataset syntax compatibility (#465)
merelcht Dec 8, 2023
6080b5c
chore: Update scikit-learn version (#469)
noklam Dec 8, 2023
c5730ab
build(telemetry): Release 0.3.1 (#475)
SajidAlamQB Dec 12, 2023
f93ee7b
docs(datasets): Fix broken links in README (#477)
astrojuanlu Dec 12, 2023
7e72302
chore(datasets): replace more "data_set" instances (#476)
deepyaman Dec 12, 2023
be82889
Merge branch 'main' into sslsj
samuel-lee-sj Jan 12, 2024
1 change: 0 additions & 1 deletion kedro-airflow/features/steps/cli_steps.py
@@ -93,7 +93,6 @@ def create_project_from_config_file(context):
"-c",
str(context.config_file),
"--starter",
"astro-airflow-iris",
],
env=context.env,
cwd=str(context.temp_dir),
2 changes: 1 addition & 1 deletion kedro-airflow/pyproject.toml
@@ -24,12 +24,12 @@ Tracker = "https://github.com/kedro-org/kedro-plugins/issues"

[project.optional-dependencies]
test = [
"apache-airflow<3.0",
"bandit",
"behave",
"black~=22.0",
"connexion<3.0.0", # TODO: Temporary fix, connexion has changed their API, but airflow hasn't caught up yet
"kedro-datasets",
"pendulum<3.0.0", # TODO: Also to be removed
"pre-commit>=2.9.2",
"pytest",
"pytest-cov",
3 changes: 1 addition & 2 deletions kedro-datasets/docs/source/conf.py
@@ -25,7 +25,7 @@

# -- Project information -----------------------------------------------------

project = "kedro-datasets"
project = "kedro"
author = "kedro"

# The short X.Y version.
@@ -99,7 +99,6 @@
"py:class": (
"kedro.io.core.AbstractDataset",
"kedro.io.AbstractDataset",
"AbstractDataset",
"kedro.io.core.Version",
"requests.auth.AuthBase",
"google.oauth2.credentials.Credentials",
@@ -31,8 +31,6 @@ class HFDataset(AbstractVersionedDataset):
>>> assert len(yelp_review_full["train"]) == 650000

"""

def __init__(self, *, dataset_name: str):
self.dataset_name = dataset_name

def _load(self):
@@ -37,7 +37,6 @@ class HFTransformerPipelineDataset(AbstractDataset):

def __init__(
self,
*,
task: str | None = None,
model_name: str | None = None,
pipeline_kwargs: dict[t.Any] | None = None,
13 changes: 13 additions & 0 deletions kedro-datasets/kedro_datasets/matlab/__init__.py
@@ -0,0 +1,13 @@
"""``AbstractDataset`` implementation to load/save data from/to a Matlab file."""
from __future__ import annotations

from typing import Any

import lazy_loader as lazy

MatlabDataSet: type[MatlabDataset]
MatlabDataset: Any

__getattr__, __dir__, __all__ = lazy.attach(
__name__, submod_attrs={"matlab_dataset": ["MatlabDataSet", "MatlabDataset"]}
)
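The `lazy.attach` call above defers importing `matlab_dataset` (and with it `scipy`) until `MatlabDataset` is first accessed on the package. A minimal, stdlib-only sketch of the PEP 562 mechanism that `lazy_loader.attach` builds on — the `attach` helper below is a hypothetical stand-in for illustration, not the real `lazy_loader` API:

```python
import importlib


def attach(package_name, submod_attrs):
    """Return (__getattr__, __dir__, __all__) that import submodules on demand."""
    # Map each exported attribute to the submodule that defines it.
    attr_to_module = {
        attr: mod for mod, attrs in submod_attrs.items() for attr in attrs
    }

    def __getattr__(name):
        if name in attr_to_module:
            # The import happens only now, on first attribute access.
            submodule = importlib.import_module(
                f"{package_name}.{attr_to_module[name]}"
            )
            return getattr(submodule, name)
        raise AttributeError(f"module {package_name!r} has no attribute {name!r}")

    def __dir__():
        return sorted(attr_to_module)

    return __getattr__, __dir__, sorted(attr_to_module)
```

Assigning the returned trio to the module-level names `__getattr__`, `__dir__`, and `__all__` (as the diff does) means `from kedro_datasets.matlab import MatlabDataset` pays the `scipy` import cost only when actually used.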
162 changes: 162 additions & 0 deletions kedro-datasets/kedro_datasets/matlab/matlab_dataset.py
@@ -0,0 +1,162 @@
"""``MatlabDataset`` loads/saves data from/to a Matlab file using an underlying
filesystem (e.g. local, S3, GCS). The underlying functionality is provided by
``scipy.io``, so all options supported by ``scipy.io.loadmat`` and
``scipy.io.savemat`` are available for loading and saving Matlab files.
"""
import warnings
from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any, Dict

import fsspec
import numpy as np
from kedro.io.core import Version, get_filepath_str, get_protocol_and_path
from scipy import io

from kedro_datasets import KedroDeprecationWarning
from kedro_datasets._io import AbstractVersionedDataset, DatasetError


class MatlabDataset(AbstractVersionedDataset[np.ndarray, np.ndarray]):
    """``MatlabDataset`` loads and saves data from/to a MATLAB file using ``scipy.io``.

Example usage for the
`YAML API <https://kedro.readthedocs.io/en/stable/data/\
data_catalog_yaml_examples.html>`_:

    .. code-block:: yaml

        cars:
          type: matlab.MatlabDataset
          filepath: gcs://your_bucket/cars.mat
          fs_args:
            project: my-project
          credentials: my_gcp_credentials

Example usage for the
`Python API <https://kedro.readthedocs.io/en/stable/data/\
advanced_data_catalog_usage.html>`_:

    .. code-block:: pycon

        >>> from kedro_datasets.matlab import MatlabDataset
        >>> data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}
        >>> dataset = MatlabDataset(filepath="my_data.mat")
        >>> dataset.save(data)
        >>> reloaded = dataset.load()
        >>> assert "data" in reloaded  # variables are stored under the "data" key
    """
    DEFAULT_SAVE_ARGS: Dict[str, Any] = {}

    def __init__(  # noqa: PLR0913
        self,
        filepath: str,
        save_args: Dict[str, Any] = None,
        version: Version = None,
        credentials: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
        metadata: Dict[str, Any] = None,
    ) -> None:
        """Creates a new instance of ``MatlabDataset`` to load and save data
        from/to a MATLAB file.

Args:
filepath: Filepath in POSIX format to a Matlab file prefixed with a protocol like `s3://`.
If prefix is not provided, `file` protocol (local filesystem) will be used.
The prefix should be any protocol supported by ``fsspec``.
Note: `http(s)` doesn't support versioning.
save_args: .mat options for saving .mat files.
version: If specified, should be an instance of
``kedro.io.core.Version``. If its ``load`` attribute is
None, the latest version will be loaded. If its ``save``
attribute is None, save version will be autogenerated.
credentials: Credentials required to get access to the underlying filesystem.
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
to pass to the filesystem's `open` method through nested keys
`open_args_load` and `open_args_save`.
Here you can find all available arguments for `open`:
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
                All defaults are preserved, except `mode`, which is set to `rb` when loading
                and to `wb` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
_fs_open_args_save = _fs_args.pop("open_args_save", {})
_credentials = deepcopy(credentials) or {}

protocol, path = get_protocol_and_path(filepath, version)
self._protocol = protocol
if protocol == "file":
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
exists_function=self._fs.exists,
glob_function=self._fs.glob,
)
        # Handle default save arguments
self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
if save_args is not None:
self._save_args.update(save_args)

        _fs_open_args_save.setdefault("mode", "wb")
self._fs_open_args_load = _fs_open_args_load
self._fs_open_args_save = _fs_open_args_save

def _describe(self) -> Dict[str, Any]:
return {
"filepath": self._filepath,
"protocol": self._protocol,
"save_args": self._save_args,
"version": self._version,
}

    def _load(self) -> Dict[str, Any]:
        """Load the .mat file and return its variables as a dictionary;
        access a specific variable as ``data["variable_name"]``."""
        load_path = get_filepath_str(self._filepath, self._protocol)
        with self._fs.open(load_path, mode="rb") as f:
            data = io.loadmat(f)
        return data

    def _save(self, data: np.ndarray) -> None:
        save_path = get_filepath_str(self._filepath, self._protocol)
        with self._fs.open(save_path, mode="wb") as f:
            # The array is stored under the "data" key in the .mat file.
            io.savemat(f, {"data": data}, **self._save_args)
self._invalidate_cache()

    def _exists(self) -> bool:
try:
load_path = get_filepath_str(self._get_load_path(), self._protocol)
except DatasetError:
return False

return self._fs.exists(load_path)

def _release(self) -> None:
super()._release()
self._invalidate_cache()

def _invalidate_cache(self) -> None:
"""Invalidate underlying filesystem caches."""
filepath = get_filepath_str(self._filepath, self._protocol)
self._fs._invalidate_cache(filepath)

_DEPRECATED_CLASSES = {"MatlabDataSet": MatlabDataset}

def __getattr__(name):
if name in _DEPRECATED_CLASSES:
alias = _DEPRECATED_CLASSES[name]
warnings.warn(
f"{repr(name)} has been renamed to {repr(alias.__name__)}, "
f"and the alias will be removed in Kedro-Datasets 2.0.0",
KedroDeprecationWarning,
stacklevel=2,
)
return alias
raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}")
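The module-level `__getattr__` at the end of the file is the standard Kedro-Datasets deprecation shim: the old `MatlabDataSet` spelling still resolves, but emits a warning steering users to the new name. A self-contained sketch of the same pattern, using a dummy class and the stdlib `warnings` module (names here are illustrative):

```python
import warnings


class MatlabDataset:
    """Stand-in for the real dataset class."""


_DEPRECATED_CLASSES = {"MatlabDataSet": MatlabDataset}


def module_getattr(name):
    """Plays the role of the module-level __getattr__ hook (PEP 562)."""
    if name in _DEPRECATED_CLASSES:
        alias = _DEPRECATED_CLASSES[name]
        # Warn once per access, pointing callers at the new spelling.
        warnings.warn(
            f"{name!r} has been renamed to {alias.__name__!r}",
            DeprecationWarning,
            stacklevel=2,
        )
        return alias
    raise AttributeError(f"module has no attribute {name!r}")
```

Because the hook returns the renamed class itself, old code keeps working unchanged while the warning surfaces in test suites and logs.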
6 changes: 6 additions & 0 deletions kedro-datasets/setup.py
@@ -83,6 +83,12 @@ def _collect_requirements(requires):
"pyarrow>=4.0",
"deltalake >= 0.6.2",
],
"polars.LazyPolarsDataset": [
# Note: there is no Lazy read Excel option, so we exclude xlsx2csv here.
POLARS,
"pyarrow>=4.0",
"deltalake >= 0.6.2",
],
}
redis_require = {"redis.PickleDataset": ["redis~=4.1"]}
snowflake_require = {
Empty file.