Skip to content

Commit

Permalink
feat: networkx datasets gml and graphml
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed Sep 9, 2021
1 parent f038e97 commit 814d932
Show file tree
Hide file tree
Showing 9 changed files with 856 additions and 75 deletions.
6 changes: 6 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
| Type | Description | Location |
| ------------------- | -------------------------------------------------------------- | ------------------------------ |
| `pandas.XMLDataSet` | Read XML into Pandas DataFrame. Write Pandas DataFrame to XML. | `kedro.extras.datasets.pandas` |
| `networkx.GraphMLDataSet` | Work with NetworkX using GraphML files | `kedro.extras.datasets.networkx` |
| `networkx.GMLDataSet` | Work with NetworkX using Graph Modelling Language files | `kedro.extras.datasets.networkx` |

## Breaking changes to the API
* Add namespace to parameters in a modular pipeline, which addresses [Issue 399](https://github.com/quantumblacklabs/kedro/issues/399)
Expand Down Expand Up @@ -54,9 +56,11 @@
* `kedro pipeline package` now accepts a module path to the pipeline or utility module to package, relative to the `src/<package_name>/`.
* Renamed `lambda_data_set`, `memory_data_set`, and `partitioned_data_set` to `lambda_dataset`, `memory_dataset`, and `partitioned_dataset`, respectively, in `kedro.io`.
* Removed the `kedro install` command in favour of using `pip install -r src/requirements.txt` to install project dependencies.
* The dataset `networkx.NetworkXDataSet` has been renamed to `networkx.JSONDataSet`.

## Thanks for supporting contributions
[Deepyaman Datta](https://github.com/deepyaman)
[Simon Brugman](https://github.com/sbrugman)

## Migration guide from Kedro 0.17.* to 0.18.*
* Please remove any existing `hook_impl` of the `register_config_loader` method from `ProjectHooks` (or custom alternatives).
Expand Down Expand Up @@ -90,6 +94,8 @@
"pipelines.data_science" = {alias = "ds", env = "local"}
```

* If you had any `networkx.NetworkXDataSet` entries in your catalog, replace them with `networkx.JSONDataSet`.

# Upcoming Release 0.17.5

## Major features and improvements
Expand Down
4 changes: 3 additions & 1 deletion docs/source/15_api_docs/kedro.extras.datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ kedro.extras.datasets
kedro.extras.datasets.holoviews.HoloviewsWriter
kedro.extras.datasets.json.JSONDataSet
kedro.extras.datasets.matplotlib.MatplotlibWriter
kedro.extras.datasets.networkx.NetworkXDataSet
kedro.extras.datasets.networkx.GMLDataSet
kedro.extras.datasets.networkx.GraphMLDataSet
kedro.extras.datasets.networkx.JSONDataSet
kedro.extras.datasets.pandas.CSVDataSet
kedro.extras.datasets.pandas.ExcelDataSet
kedro.extras.datasets.pandas.FeatherDataSet
Expand Down
12 changes: 9 additions & 3 deletions kedro/extras/datasets/networkx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,17 @@
# limitations under the License.

"""``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON
format using ``NetworkX``."""
, GraphML and GML formats using ``NetworkX``."""

__all__ = ["NetworkXDataSet"]
__all__ = ["GMLDataSet", "GraphMLDataSet", "JSONDataSet"]

from contextlib import suppress

with suppress(ImportError):
from .networkx_dataset import NetworkXDataSet # NOQA
from .gml_dataset import GMLDataSet

with suppress(ImportError):
from .graphml_dataset import GraphMLDataSet

with suppress(ImportError):
from .json_dataset import JSONDataSet
170 changes: 170 additions & 0 deletions kedro/extras/datasets/networkx/gml_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Copyright 2021 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
# or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.


"""NetworkX ``GMLDataSet`` loads and saves graphs to a graph modelling language (GML)
file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to
create GML data.
"""

from copy import deepcopy
from pathlib import PurePosixPath
from typing import Any, Dict

import fsspec
import networkx

from kedro.io.core import (
AbstractVersionedDataSet,
Version,
get_filepath_str,
get_protocol_and_path,
)


class GMLDataSet(AbstractVersionedDataSet):
"""``GMLDataSet`` loads and saves graphs to a GML file using an
underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to
create GML data.
See https://networkx.org/documentation/stable/tutorial.html for details.
Example:
::
>>> from kedro.extras.datasets.networkx import GMLDataSet
>>> import networkx as nx
>>> graph = nx.complete_graph(100)
>>> graph_dataset = GMLDataSet(filepath="test.gml")
>>> graph_dataset.save(graph)
>>> reloaded = graph_dataset.load()
>>> assert nx.is_isomorphic(graph, reloaded)
"""

DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any]
DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any]

# pylint: disable=too-many-arguments
def __init__(
self,
filepath: str,
load_args: Dict[str, Any] = None,
save_args: Dict[str, Any] = None,
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GMLDataSet``.
Args:
filepath: Filepath in POSIX format to the NetworkX GML file.
load_args: Arguments passed on to ``networkx.read_gml``.
See the details in
https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.gml.read_gml.html
save_args: Arguments passed on to ``networkx.write_gml``.
See the details in
https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.gml.write_gml.html
version: If specified, should be an instance of
``kedro.io.core.Version``. If its ``load`` attribute is
None, the latest version will be loaded. If its ``save``
attribute is None, save version will be autogenerated.
credentials: Credentials required to get access to the underlying filesystem.
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
to pass to the filesystem's `open` method through nested keys
`open_args_load` and `open_args_save`.
Here you can find all available arguments for `open`:
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
_fs_open_args_save = _fs_args.pop("open_args_save", {})
_credentials = deepcopy(credentials) or {}

protocol, path = get_protocol_and_path(filepath, version)
if protocol == "file":
_fs_args.setdefault("auto_mkdir", True)

self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

super().__init__(
filepath=PurePosixPath(path),
version=version,
exists_function=self._fs.exists,
glob_function=self._fs.glob,
)

# Handle default load and save arguments
self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
if load_args is not None:
self._load_args.update(load_args)
self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
if save_args is not None:
self._save_args.update(save_args)
_fs_open_args_load.setdefault("mode", "rb")
_fs_open_args_save.setdefault("mode", "wb")
self._fs_open_args_load = _fs_open_args_load
self._fs_open_args_save = _fs_open_args_save

def _load(self) -> networkx.Graph:
load_path = get_filepath_str(self._get_load_path(), self._protocol)
with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
data = networkx.read_gml(fs_file, **self._load_args)
return data

def _save(self, data: networkx.Graph) -> None:
save_path = get_filepath_str(self._get_save_path(), self._protocol)
with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
networkx.write_gml(data, fs_file, **self._save_args)
self._invalidate_cache()

def _exists(self) -> bool:
load_path = get_filepath_str(self._get_load_path(), self._protocol)
return self._fs.exists(load_path)

def _describe(self) -> Dict[str, Any]:
return dict(
filepath=self._filepath,
protocol=self._protocol,
load_args=self._load_args,
save_args=self._save_args,
version=self._version,
)

def _release(self) -> None:
super()._release()
self._invalidate_cache()

def _invalidate_cache(self) -> None:
"""Invalidate underlying filesystem caches."""
filepath = get_filepath_str(self._filepath, self._protocol)
self._fs.invalidate_cache(filepath)
Loading

0 comments on commit 814d932

Please sign in to comment.