From 4ad6f9f2974a63b58c641c8c66314d4df6d4fa48 Mon Sep 17 00:00:00 2001 From: roll Date: Thu, 7 Dec 2023 15:50:43 +0000 Subject: [PATCH] Dcat mapper (#4) * Added Dcat model * Bootstrapped DcatPackage.from_xml * Added mode dcat mappings * Updated model methods * Finished dcat package props * Improved model methods * Mapped dcat resource * Added todos * Renamed parsers -> loaders/dumpers * Improved model methods * Added platform * Removed platform * Removed todos * Mapped single value to graph * Added DcatPackage.from/to_graph * Added dcat namespaces * Mapped package lists * FIxed dcat model * Fixnished dcat model mapping * Sorted DcatResource props * Sorted DcatPackage props * Implemented dcat to dp * Implemented dp to dcat --- README.md | 4 +- dplib/actions/__init__.py | 0 dplib/actions/schema/__init__.py | 0 dplib/actions/schema/check.py | 14 + dplib/error.py | 2 + dplib/helpers/file.py | 45 +++ dplib/helpers/resource.py | 4 +- dplib/model.py | 73 +++-- dplib/models/resource/resource.py | 8 +- dplib/plugins/ckan/models/package.py | 2 +- dplib/plugins/ckan/models/resource.py | 6 +- dplib/plugins/cli/__init__.py | 0 dplib/plugins/datacite/models/package.py | 2 +- dplib/plugins/dcat/models/__init__.py | 2 + dplib/plugins/dcat/models/dumpers.py | 17 ++ dplib/plugins/dcat/models/helpers.py | 29 ++ dplib/plugins/dcat/models/loaders.py | 57 ++++ dplib/plugins/dcat/models/namespaces.py | 44 +++ dplib/plugins/dcat/models/package.py | 356 +++++++++++++++++++++++ dplib/plugins/dcat/models/resource.py | 208 +++++++++++++ dplib/plugins/dcat/models/types.py | 6 + dplib/plugins/github/models/package.py | 2 +- dplib/plugins/github/models/resource.py | 6 +- dplib/plugins/pandas/models/field.py | 2 +- dplib/plugins/pandas/models/schema.py | 2 +- dplib/plugins/polars/models/field.py | 2 +- dplib/plugins/polars/models/schema.py | 2 +- dplib/plugins/sql/models/field.py | 2 +- dplib/plugins/sql/models/schema.py | 2 +- dplib/plugins/zenodo/models/package.py | 2 +- 
dplib/plugins/zenodo/models/resource.py | 6 +- dplib/types.py | 2 +- pyproject.toml | 6 +- 33 files changed, 856 insertions(+), 59 deletions(-) create mode 100644 dplib/actions/__init__.py create mode 100644 dplib/actions/schema/__init__.py create mode 100644 dplib/actions/schema/check.py create mode 100644 dplib/error.py create mode 100644 dplib/helpers/file.py create mode 100644 dplib/plugins/cli/__init__.py create mode 100644 dplib/plugins/dcat/models/dumpers.py create mode 100644 dplib/plugins/dcat/models/helpers.py create mode 100644 dplib/plugins/dcat/models/loaders.py create mode 100644 dplib/plugins/dcat/models/namespaces.py create mode 100644 dplib/plugins/dcat/models/types.py diff --git a/README.md b/README.md index eaec119..415aa92 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# dplib-py +# Data Packaging Library [![Build](https://img.shields.io/github/actions/workflow/status/frictionlessdata/dplib-py/general.yaml?branch=main)](https://github.com/frictionlessdata/dplib-py/actions) [![Coverage](https://img.shields.io/codecov/c/github/frictionlessdata/dplib-py/main)](https://codecov.io/gh/frictionlessdata/dplib-py) [![Release](https://img.shields.io/pypi/v/dplib-py.svg)](https://pypi.python.org/pypi/dplib-py) [![Codebase](https://img.shields.io/badge/codebase-github-brightgreen)](https://github.com/frictionlessdata/dplib-py) -Python implementation of the Data Package standard +Python implementation of the Data Package standard and various models and utils for working with datasets. 
diff --git a/dplib/actions/__init__.py b/dplib/actions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dplib/actions/schema/__init__.py b/dplib/actions/schema/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dplib/actions/schema/check.py b/dplib/actions/schema/check.py new file mode 100644 index 0000000..1f12771 --- /dev/null +++ b/dplib/actions/schema/check.py @@ -0,0 +1,14 @@ +# from pydantic import BaseModel, ValidationError +# from pydantic_core import ErrorDetails + +# def schema_check(cls, descriptor: Dict[str, Any]): +# errors: List[ErrorDetails] = [] +# try: +# cls.model_validate(descriptor) +# except ValidationError as e: +# errors = e.errors() +# return errors + + +def schema_check(): + pass diff --git a/dplib/error.py b/dplib/error.py new file mode 100644 index 0000000..9c69c25 --- /dev/null +++ b/dplib/error.py @@ -0,0 +1,2 @@ +class Error(Exception): + pass diff --git a/dplib/helpers/file.py b/dplib/helpers/file.py new file mode 100644 index 0000000..87e6ce7 --- /dev/null +++ b/dplib/helpers/file.py @@ -0,0 +1,45 @@ +import os +import shutil +import tempfile +from pathlib import Path +from typing import Any, Optional + +import fsspec # type: ignore + +from ..error import Error + + +def read_file(path: str, *, mode: str = "rt", encoding: str = "utf-8") -> str: + try: + with fsspec.open(path, mode=mode, encoding=encoding) as file: # type: ignore + return file.read() # type: ignore + except Exception as exception: + raise Error(f'Cannot read file "{path}": {exception}') + + +def write_file(path: str, body: Any, *, mode: str = "wt", encoding: str = "utf-8"): + try: + eff_enc = encoding if mode == "wt" else None + with tempfile.NamedTemporaryFile(mode, delete=False, encoding=eff_enc) as file: + file.write(body) + file.flush() + move_file(file.name, path, mode=0o644) + except Exception as exception: + raise Error(f'Cannot write file "{path}": {exception}') + + +def move_file(source: str, target: str, *, mode: 
Optional[int] = None): + try: + Path(target).parent.mkdir(parents=True, exist_ok=True) + shutil.move(source, target) + if mode: + os.chmod(target, 0o644) + except Exception as exception: + raise Error(f'Cannot move file "{source}:{target}": {exception}') + + +def infer_format(path: str): + format = Path(path).suffix[1:] + if format == "yml": + format = "yaml" + return format or None diff --git a/dplib/helpers/resource.py b/dplib/helpers/resource.py index 26f2721..89aace4 100644 --- a/dplib/helpers/resource.py +++ b/dplib/helpers/resource.py @@ -3,5 +3,5 @@ from slugify import slugify -def path_to_name(path: str) -> str: - return slugify(Path(path).stem, separator="_") +def slugify_name(name: str) -> str: + return slugify(Path(name).stem, separator="_") diff --git a/dplib/model.py b/dplib/model.py index 9a1e5b2..a4c22da 100644 --- a/dplib/model.py +++ b/dplib/model.py @@ -1,14 +1,20 @@ +from __future__ import annotations + +import json import pprint -from typing import Any, Dict, List +from importlib import import_module +from typing import Optional -from pydantic import BaseModel, ValidationError -from pydantic_core import ErrorDetails +from pydantic import BaseModel +from typing_extensions import Self from . import types +from .error import Error +from .helpers.file import infer_format, read_file, write_file class Model(BaseModel, extra="forbid", validate_assignment=True): - custom: types.IData = {} + custom: types.IDict = {} def __str__(self) -> str: return repr(self) @@ -16,39 +22,46 @@ def __str__(self) -> str: def __repr__(self) -> str: return pprint.pformat(self.to_dict(), sort_dicts=False) - # Validators - - # TODO: rebase on validate_yaml/json/dict? 
- @classmethod - def validate_descriptor(cls, descriptor: Dict[str, Any]): - errors: List[ErrorDetails] = [] - try: - cls.model_validate(descriptor) - except ValidationError as e: - errors = e.errors() - return errors + # Converters - # Mappers + def to_path(self, path: str, *, format: Optional[str] = None): + format = format or infer_format(path) + if not format: + raise Error(f"Cannot infer format from path: {path}") + text = self.to_text(format=format) + write_file(path, text) @classmethod - def from_yaml(cls, path: str): - pass + def from_path(cls, path: str, *, format: Optional[str] = None) -> Self: + format = format or infer_format(path) + if not format: + raise Error(f"Cannot infer format from path: {path}") + text = read_file(path) + return cls.from_text(text, format=format) # type: ignore - @classmethod - def to_yaml(cls, path: str): - pass + def to_text(self, *, format: str) -> str: + data = self.to_dict() + if format == "json": + return json.dumps(data) + elif format == "yaml": + yaml = import_module("yaml") + return yaml.dump(data) + raise Error(f"Cannot convert to text for format: {format}") @classmethod - def from_json(cls, path: str): - pass + def from_text(cls, text: str, *, format: str) -> Self: + if format == "json": + data = json.loads(text) + return cls.from_dict(data) + elif format == "yaml": + yaml = import_module("yaml") + data = yaml.load(text) + return cls.from_dict(data) + raise Error(f"Cannot create from text with format: {format}") - @classmethod - def to_json(cls, path: str): - pass + def to_dict(self): + return self.model_dump(mode="json", exclude_unset=True, exclude_none=True) @classmethod - def from_dict(cls, data: types.IData): + def from_dict(cls, data: types.IDict) -> Self: return cls(**data) - - def to_dict(self): - return self.model_dump(mode="json", exclude_unset=True, exclude_none=True) diff --git a/dplib/models/resource/resource.py b/dplib/models/resource/resource.py index b6f32e1..fb92261 100644 --- 
a/dplib/models/resource/resource.py +++ b/dplib/models/resource/resource.py @@ -17,7 +17,7 @@ class Resource(Model): profile: Optional[str] = None path: Optional[str] = None - data: Optional[types.IData] = None + data: Optional[types.IDict] = None dialect: Optional[Dialect] = None schema: Optional[Schema] = None # type: ignore @@ -29,9 +29,9 @@ class Resource(Model): encoding: Optional[str] = None bytes: Optional[int] = None hash: Optional[str] = None - sources: Optional[List[Source]] = None - licenses: Optional[List[License]] = None - contributors: Optional[List[Contributor]] = None + sources: List[Source] = [] + licenses: List[License] = [] + contributors: List[Contributor] = [] @property def parsed_hash(self) -> Optional[ParsedHash]: diff --git a/dplib/plugins/ckan/models/package.py b/dplib/plugins/ckan/models/package.py index 76c9afa..fa89dc1 100644 --- a/dplib/plugins/ckan/models/package.py +++ b/dplib/plugins/ckan/models/package.py @@ -35,7 +35,7 @@ class CkanPackage(Model): metadata_created: Optional[str] = None metadata_modified: Optional[str] = None - # Mappers + # Converters def to_dp(self): package = Package() diff --git a/dplib/plugins/ckan/models/resource.py b/dplib/plugins/ckan/models/resource.py index 71054b3..674e7ea 100644 --- a/dplib/plugins/ckan/models/resource.py +++ b/dplib/plugins/ckan/models/resource.py @@ -2,7 +2,7 @@ from typing import Optional -from dplib.helpers.resource import path_to_name +from dplib.helpers.resource import slugify_name from dplib.model import Model from dplib.models import Resource @@ -19,10 +19,10 @@ class CkanResource(Model): mimetype: Optional[str] = None size: Optional[int] = None - # Mappers + # Converters def to_dp(self) -> Resource: - resource = Resource(path=self.name, name=path_to_name(self.name)) + resource = Resource(path=self.name, name=slugify_name(self.name)) # Format if self.format: diff --git a/dplib/plugins/cli/__init__.py b/dplib/plugins/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff 
--git a/dplib/plugins/datacite/models/package.py b/dplib/plugins/datacite/models/package.py index 13074c3..523b25a 100644 --- a/dplib/plugins/datacite/models/package.py +++ b/dplib/plugins/datacite/models/package.py @@ -33,7 +33,7 @@ class DatacitePackage(Model): subjects: List[DataciteSubject] = [] titles: List[DataciteTitle] = [] - # Mappers + # Converters def to_dp(self) -> Package: package = Package() diff --git a/dplib/plugins/dcat/models/__init__.py b/dplib/plugins/dcat/models/__init__.py index e69de29..9ac46c1 100644 --- a/dplib/plugins/dcat/models/__init__.py +++ b/dplib/plugins/dcat/models/__init__.py @@ -0,0 +1,2 @@ +from .package import DcatPackage +from .resource import DcatResource diff --git a/dplib/plugins/dcat/models/dumpers.py b/dplib/plugins/dcat/models/dumpers.py new file mode 100644 index 0000000..b3b158e --- /dev/null +++ b/dplib/plugins/dcat/models/dumpers.py @@ -0,0 +1,17 @@ +from typing import Any + +from rdflib import Graph, URIRef + +from .helpers import create_node +from .types import ISubject + + +def id(g: Graph, identifier: str, *, predicate: URIRef, object: URIRef): + subject = URIRef(identifier) + g.add((subject, predicate, object)) + return subject + + +def node(g: Graph, value: Any, *, subject: ISubject, predicate: URIRef): + object = create_node(value) + g.add((subject, predicate, object)) diff --git a/dplib/plugins/dcat/models/helpers.py b/dplib/plugins/dcat/models/helpers.py new file mode 100644 index 0000000..521e51e --- /dev/null +++ b/dplib/plugins/dcat/models/helpers.py @@ -0,0 +1,29 @@ +from typing import Any, Union +from urllib.parse import quote + +from rdflib import Literal, URIRef + + +# https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/profiles.py +def create_node(value: Any) -> Union[URIRef, Literal]: + try: + stripped_value = value.strip() + if stripped_value.startswith("http://") or stripped_value.startswith("https://"): + # only encode this limited subset of characters to avoid more complex URL parsing 
+ # (e.g. valid ? in query string vs. ? as value). + # can be applied multiple times, as encoded %xy is left untouched. Therefore, no + # unquote is necessary beforehand. + quotechars = " !\"$'()*,;<>[]{|}\\^`" + for c in quotechars: + value = value.replace(c, quote(c)) + # although all invalid chars checked by rdflib should have been quoted, try to serialize + # the object. If it breaks, use Literal instead. + value = URIRef(value) + value.n3() + # URI is fine, return the object + return value + else: + return Literal(value) + except Exception: + # In case something goes wrong: use Literal + return Literal(value) diff --git a/dplib/plugins/dcat/models/loaders.py b/dplib/plugins/dcat/models/loaders.py new file mode 100644 index 0000000..261d508 --- /dev/null +++ b/dplib/plugins/dcat/models/loaders.py @@ -0,0 +1,57 @@ +from typing import List, Optional + +from rdflib import Graph, Literal, URIRef + +from .types import IStringNode, ISubject + + +def id(g: Graph, *, predicate: URIRef, object: URIRef) -> Optional[URIRef]: + try: + id = g.value(predicate=predicate, object=object) + if isinstance(id, URIRef): + return id + except Exception: + pass + + +def node(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[IStringNode]: + default_lang = "en" + items = list(g.objects(subject, predicate)) + + # Prefer the default language + for item in items: + if isinstance(item, Literal): + if item.language and item.language == default_lang: + return item + + # Otherwise, return the first item + for item in items: + if isinstance(item, (URIRef, Literal)): + return item + + +def string(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[str]: + value = node(g, subject=subject, predicate=predicate) + if value: + return str(value) + + +def integer(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[int]: + value = node(g, subject=subject, predicate=predicate) + if value: + try: + return int(value) + except Exception: + pass + + +def nodes(g: Graph, *, 
subject: ISubject, predicate: URIRef) -> List[IStringNode]: + return [ + item + for item in g.objects(subject, predicate) + if isinstance(item, (URIRef, Literal)) + ] + + +def strings(g: Graph, *, subject: ISubject, predicate: URIRef) -> List[str]: + return [str(item) for item in nodes(g, subject=subject, predicate=predicate)] diff --git a/dplib/plugins/dcat/models/namespaces.py b/dplib/plugins/dcat/models/namespaces.py new file mode 100644 index 0000000..08e4604 --- /dev/null +++ b/dplib/plugins/dcat/models/namespaces.py @@ -0,0 +1,44 @@ +from rdflib import Namespace +from rdflib.namespace import FOAF, RDF + +ADMS = Namespace("http://www.w3.org/ns/adms#") +DCAT = Namespace("http://www.w3.org/ns/dcat#") +DCT = Namespace("http://purl.org/dc/terms/") +OWL = Namespace("http://www.w3.org/2002/07/owl#") + +ACCESS_URL = DCAT.accessURL +ACCURAL_PERIODICITY = DCT.accrualPeriodicity +ALTERNATE_IDENTIFIER = ADMS.identifier +BYTE_SIZE = DCAT.byteSize +COMFORMS_TO = DCT.conformsTo +DATASET = DCAT.Dataset +DESCRIPTION = DCT.description +DISTRIBUTION = DCAT.distribution +DOWNLOAD_URL = DCAT.downloadURL +HAS_VERSION = DCT.hasVersion +HOMEPAGE = FOAF.homepage +IDENTIFIER = DCT.identifier +ISSUED = DCT.issued +IS_VERSION_OF = DCT.isVersionOf +KEYWORD = DCAT.keyword +LANDING_PAGE = DCAT.landingPage +LANGUAGE = DCT.language +LICENSE = DCT.license +MEDIA_TYPE = DCAT.mediaType +MODIFIED = DCT.modified +PAGE = FOAF.page +PROVENANCE = DCT.provenance +RELATED_RESOURCE = DCT.relation +SAMPLE = ADMS.sample +SOURCE = DCT.source +THEME = DCAT.theme +TITLE = DCT.title +TYPE = RDF.type +VERSION = OWL.versionInfo + +BINDINGS = { + "adms": ADMS, + "dcat": DCAT, + "dct": DCT, + "owl": OWL, +} diff --git a/dplib/plugins/dcat/models/package.py b/dplib/plugins/dcat/models/package.py index e69de29..b4487ea 100644 --- a/dplib/plugins/dcat/models/package.py +++ b/dplib/plugins/dcat/models/package.py @@ -0,0 +1,356 @@ +from __future__ import annotations + +from typing import List, Optional + +from rdflib 
import BNode, Graph, URIRef + +from dplib.error import Error +from dplib.model import Model +from dplib.models import Package + +from . import dumpers, loaders +from . import namespaces as ns +from .resource import DcatResource + +# References: +# - https://www.w3.org/TR/vocab-dcat-2/ +# - https://joinup.ec.europa.eu/asset/dcat_application_profile +# - https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/profiles.py + + +class DcatPackage(Model): + identifier: Optional[str] = None + distributions: List[DcatResource] = [] + + accural_periodicity: Optional[str] = None + alternate_identifiers: List[str] = [] + comforms_to: List[str] = [] + description: Optional[str] = None + has_versions: List[str] = [] + homepage: Optional[str] = None + issued: Optional[str] = None + is_version_of: List[str] = [] + keywords: List[str] = [] + landing_page: Optional[str] = None + languages: List[str] = [] + modified: Optional[str] = None + pages: List[str] = [] + provenance: Optional[str] = None + related_resources: List[str] = [] + samples: List[str] = [] + sources: List[str] = [] + themes: List[str] = [] + title: Optional[str] = None + version: Optional[str] = None + + # Converters + + def to_text(self, *, format: str): + g = self.to_graph() + return g.serialize(format=format) + + @classmethod + def from_text(cls, text: str, *, format: str): + g = Graph() + g.parse(data=text, format=format) + return cls.from_graph(g) + + def to_graph(self): + g = Graph() + for prefix, namespace in ns.BINDINGS.items(): + g.bind(prefix, namespace) + + # Identifier + if not self.identifier: + raise Error(f"Cannot dump DCAT package without identifier: {self}") + id = dumpers.id(g, self.identifier, predicate=ns.TYPE, object=ns.DATASET) + + # Accural periodicity + if self.accural_periodicity: + dumpers.node( + g, self.accural_periodicity, subject=id, predicate=ns.ACCURAL_PERIODICITY + ) + + # Alternate identifiers + for identifier in self.alternate_identifiers: + dumpers.node(g, identifier, 
subject=id, predicate=ns.ALTERNATE_IDENTIFIER) + + # Conforms to + for conforms_to in self.comforms_to: + dumpers.node(g, conforms_to, subject=id, predicate=ns.COMFORMS_TO) + + # Description + if self.description: + dumpers.node(g, self.description, subject=id, predicate=ns.DESCRIPTION) + + # Has versions + for has_version in self.has_versions: + dumpers.node(g, has_version, subject=id, predicate=ns.HAS_VERSION) + + # Homepage + if self.homepage: + dumpers.node(g, self.homepage, subject=id, predicate=ns.HOMEPAGE) + + # Issued + if self.issued: + dumpers.node(g, self.issued, subject=id, predicate=ns.ISSUED) + + # Is version of + for is_version_of in self.is_version_of: + dumpers.node(g, is_version_of, subject=id, predicate=ns.IS_VERSION_OF) + + # Keywords + for keyword in self.keywords: + dumpers.node(g, keyword, subject=id, predicate=ns.KEYWORD) + + # Landing page + if self.landing_page: + dumpers.node(g, self.landing_page, subject=id, predicate=ns.LANDING_PAGE) + + # Languages + for language in self.languages: + dumpers.node(g, language, subject=id, predicate=ns.LANGUAGE) + + # Modified + if self.modified: + dumpers.node(g, self.modified, subject=id, predicate=ns.MODIFIED) + + # Pages + for page in self.pages: + dumpers.node(g, page, subject=id, predicate=ns.PAGE) + + # Provenance + if self.provenance: + dumpers.node(g, self.provenance, subject=id, predicate=ns.PROVENANCE) + + # Related resources + for related_resource in self.related_resources: + dumpers.node(g, related_resource, subject=id, predicate=ns.RELATED_RESOURCE) + + # Samples + for sample in self.samples: + dumpers.node(g, sample, subject=id, predicate=ns.SAMPLE) + + # Sources + for source in self.sources: + dumpers.node(g, source, subject=id, predicate=ns.SOURCE) + + # Themes + for theme in self.themes: + dumpers.node(g, theme, subject=id, predicate=ns.THEME) + + # Title + if self.title: + dumpers.node(g, self.title, subject=id, predicate=ns.TITLE) + + # Version + if self.version: + dumpers.node(g, 
self.version, subject=id, predicate=ns.VERSION) + + # Distributions + for distribution in self.distributions: + distribution_id = BNode() + g.add((id, ns.DISTRIBUTION, distribution_id)) + g.add((distribution_id, ns.TYPE, ns.DISTRIBUTION)) + distribution.to_graph(g, id=distribution_id) + + return g + + @classmethod + def from_graph(cls, g: Graph): + package = DcatPackage() + + # Identifier + id = loaders.id(g, predicate=ns.TYPE, object=ns.DATASET) + if not id: + raise Error(f"Cannot load DCAT package without identifier: {g}") + package.identifier = str(id) + + # Accural periodicity + periodicity = loaders.string(g, subject=id, predicate=ns.ACCURAL_PERIODICITY) + if periodicity: + package.accural_periodicity = periodicity + + # Alternate identifiers + identifiers = loaders.strings(g, subject=id, predicate=ns.ALTERNATE_IDENTIFIER) + if identifiers: + package.alternate_identifiers = identifiers + + # Conforms to + conforms_to = loaders.strings(g, subject=id, predicate=ns.COMFORMS_TO) + if conforms_to: + package.comforms_to = conforms_to + + # Description + description = loaders.string(g, subject=id, predicate=ns.DESCRIPTION) + if description: + package.description = description + + # Has versions + has_versions = loaders.strings(g, subject=id, predicate=ns.HAS_VERSION) + if has_versions: + package.has_versions = has_versions + + # Homepage + homepage = loaders.string(g, subject=id, predicate=ns.HOMEPAGE) + if homepage: + package.homepage = homepage + + # Issued + issued = loaders.string(g, subject=id, predicate=ns.ISSUED) + if issued: + package.issued = issued + + # Is version of + is_version_of = loaders.strings(g, subject=id, predicate=ns.IS_VERSION_OF) + if is_version_of: + package.is_version_of = is_version_of + + # Keywords + keywords = loaders.strings(g, subject=id, predicate=ns.KEYWORD) + if keywords: + package.keywords = keywords + + # Landing page + landing_page = loaders.string(g, subject=id, predicate=ns.LANDING_PAGE) + if landing_page: + 
package.landing_page = landing_page + + # Languages + languages = loaders.strings(g, subject=id, predicate=ns.LANGUAGE) + if languages: + package.languages = languages + + # Modified + modified = loaders.string(g, subject=id, predicate=ns.MODIFIED) + if modified: + package.modified = modified + + # Pages + pages = loaders.strings(g, subject=id, predicate=ns.PAGE) + if pages: + package.pages = pages + + # Provenance + provenance = loaders.string(g, subject=id, predicate=ns.PROVENANCE) + if provenance: + package.provenance = provenance + + # Related resources + related_resources = loaders.strings(g, subject=id, predicate=ns.RELATED_RESOURCE) + if related_resources: + package.related_resources = related_resources + + # Samples + samples = loaders.strings(g, subject=id, predicate=ns.SAMPLE) + if samples: + package.samples = samples + + # Sources + sources = loaders.strings(g, subject=id, predicate=ns.SOURCE) + if sources: + package.sources = sources + + # Themes + themes = loaders.strings(g, subject=id, predicate=ns.THEME) + if themes: + package.themes = themes + + # Title + title = loaders.string(g, subject=id, predicate=ns.TITLE) + if title: + package.title = title + + # Version + version = loaders.string(g, subject=id, predicate=ns.VERSION) + if version: + package.version = version + + # Distributions + distributions = g.objects(subject=id, predicate=ns.DISTRIBUTION) + for distribution in distributions: + if isinstance(distribution, (URIRef, BNode)): + resource = DcatResource.from_graph(g, id=distribution) + package.distributions.append(resource) + + return package + + def to_dp(self): + package = Package() + + # Id + if self.identifier: + package.id = self.identifier + + # Title + if self.title: + package.title = self.title + + # Description + if self.description: + package.description = self.description + + # Version + if self.version: + package.version = self.version + + # Homepage + if self.homepage: + package.homepage = self.homepage + + # Created + if 
self.issued: + if "T" in self.issued: + package.created = self.issued + + # Keywords + for keyword in self.keywords: + package.keywords.append(keyword) + + # Resources + for distribution in self.distributions: + resource = distribution.to_dp() + if resource: + package.resources.append(resource) + + return package + + @classmethod + def from_dp(cls, package: Package): + dcat = DcatPackage() + + # Identifier + if package.id: + dcat.identifier = package.id + + # Title + if package.title: + dcat.title = package.title + + # Description + if package.description: + dcat.description = package.description + + # Version + if package.version: + dcat.version = package.version + + # Homepage + if package.homepage: + dcat.homepage = package.homepage + + # Issued + if package.created: + dcat.issued = package.created + + # Keywords + for keyword in package.keywords: + dcat.keywords.append(keyword) + + # Resources + for resource in package.resources: + distribution = DcatResource.from_dp(resource) + if distribution: + dcat.distributions.append(distribution) + + return dcat diff --git a/dplib/plugins/dcat/models/resource.py b/dplib/plugins/dcat/models/resource.py index e69de29..16c0937 100644 --- a/dplib/plugins/dcat/models/resource.py +++ b/dplib/plugins/dcat/models/resource.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +from typing import List, Optional + +from rdflib import BNode, Graph + +from dplib.helpers.resource import slugify_name +from dplib.model import Model +from dplib.models import License, Resource + +from . import dumpers, loaders +from . 
class DcatResource(Model):
    """Metadata of a single DCAT distribution.

    Instances are created by ``DcatPackage`` for every distribution node of a
    dataset and can be converted to and from an RDF graph (``to_graph`` /
    ``from_graph``) and a Data Package ``Resource`` (``to_dp`` / ``from_dp``).
    """

    # NOTE(review): the list defaults below rely on ``Model`` (presumably a
    # pydantic model) copying defaults per instance -- confirm.
    access_url: Optional[str] = None
    byte_size: Optional[int] = None
    conforms_to: List[str] = []
    description: Optional[str] = None
    download_url: Optional[str] = None
    issued: Optional[str] = None
    languages: List[str] = []
    license: Optional[str] = None
    media_type: Optional[str] = None
    modified: Optional[str] = None
    pages: List[str] = []
    title: Optional[str] = None

    # Converters

    def to_graph(self, g: Graph, *, id: BNode) -> Graph:
        """Write this distribution's properties into ``g`` under subject ``id``.

        Args:
            g: Graph that receives the statements.
            id: Node used as the subject of every statement.

        Returns:
            The same graph ``g``, to allow chaining.
        """
        # Access URL
        if self.access_url:
            dumpers.node(g, self.access_url, subject=id, predicate=ns.ACCESS_URL)

        # Byte size
        # Compared against None so that a legitimate size of 0 is not dropped.
        if self.byte_size is not None:
            dumpers.node(g, self.byte_size, subject=id, predicate=ns.BYTE_SIZE)

        # Conforms to
        # NOTE: the constant is referenced as ``COMFORMS_TO`` (sic) throughout
        # this module; it is defined outside this class.
        for conforms_to in self.conforms_to:
            dumpers.node(g, conforms_to, subject=id, predicate=ns.COMFORMS_TO)

        # Description
        if self.description:
            dumpers.node(g, self.description, subject=id, predicate=ns.DESCRIPTION)

        # Download URL
        if self.download_url:
            dumpers.node(g, self.download_url, subject=id, predicate=ns.DOWNLOAD_URL)

        # Issued
        if self.issued:
            dumpers.node(g, self.issued, subject=id, predicate=ns.ISSUED)

        # Languages
        for language in self.languages:
            dumpers.node(g, language, subject=id, predicate=ns.LANGUAGE)

        # License
        if self.license:
            dumpers.node(g, self.license, subject=id, predicate=ns.LICENSE)

        # Media type
        if self.media_type:
            dumpers.node(g, self.media_type, subject=id, predicate=ns.MEDIA_TYPE)

        # Modified
        if self.modified:
            dumpers.node(g, self.modified, subject=id, predicate=ns.MODIFIED)

        # Pages
        for page in self.pages:
            dumpers.node(g, page, subject=id, predicate=ns.PAGE)

        # Title
        if self.title:
            dumpers.node(g, self.title, subject=id, predicate=ns.TITLE)

        return g

    @classmethod
    def from_graph(cls, g: Graph, *, id: ISubject) -> DcatResource:
        """Read a distribution from the statements about subject ``id`` in ``g``.

        Args:
            g: Graph to read from.
            id: Subject node identifying the distribution.

        Returns:
            A new instance of ``cls``; properties without a loaded value keep
            their defaults.
        """
        # Instantiate through ``cls`` so subclasses get instances of themselves.
        resource = cls()

        # Access URL
        access_url = loaders.string(g, subject=id, predicate=ns.ACCESS_URL)
        if access_url:
            resource.access_url = access_url

        # Byte size
        # Compared against None so that a loaded size of 0 is preserved.
        byte_size = loaders.integer(g, subject=id, predicate=ns.BYTE_SIZE)
        if byte_size is not None:
            resource.byte_size = byte_size

        # Conforms to
        conforms_to = loaders.strings(g, subject=id, predicate=ns.COMFORMS_TO)
        if conforms_to:
            resource.conforms_to = conforms_to

        # Description
        description = loaders.string(g, subject=id, predicate=ns.DESCRIPTION)
        if description:
            resource.description = description

        # Download URL
        download_url = loaders.string(g, subject=id, predicate=ns.DOWNLOAD_URL)
        if download_url:
            resource.download_url = download_url

        # Issued
        issued = loaders.string(g, subject=id, predicate=ns.ISSUED)
        if issued:
            resource.issued = issued

        # Languages
        languages = loaders.strings(g, subject=id, predicate=ns.LANGUAGE)
        if languages:
            resource.languages = languages

        # License
        license_ref = loaders.string(g, subject=id, predicate=ns.LICENSE)
        if license_ref:
            resource.license = license_ref

        # Media type
        media_type = loaders.string(g, subject=id, predicate=ns.MEDIA_TYPE)
        if media_type:
            resource.media_type = media_type

        # Modified
        modified = loaders.string(g, subject=id, predicate=ns.MODIFIED)
        if modified:
            resource.modified = modified

        # Pages
        pages = loaders.strings(g, subject=id, predicate=ns.PAGE)
        if pages:
            resource.pages = pages

        # Title
        title = loaders.string(g, subject=id, predicate=ns.TITLE)
        if title:
            resource.title = title

        return resource

    def to_dp(self) -> Optional[Resource]:
        """Convert this distribution into a Data Package ``Resource``.

        Returns:
            A ``Resource`` whose path is the download URL and whose name is
            derived from it with ``slugify_name``, or ``None`` when the
            distribution has no download URL.
        """
        # Without a download URL there is nothing to use as the resource path.
        if not self.download_url:
            return None
        resource = Resource(path=self.download_url, name=slugify_name(self.download_url))

        # Title
        if self.title:
            resource.title = self.title

        # Description
        if self.description:
            resource.description = self.description

        # Media type
        if self.media_type:
            resource.mediatype = self.media_type

        # Bytes
        # Compared against None so that a zero-byte distribution keeps its size.
        if self.byte_size is not None:
            resource.bytes = self.byte_size

        # Licenses
        if self.license:
            resource.licenses.append(License(path=self.license))

        return resource

    @classmethod
    def from_dp(cls, resource: Resource) -> DcatResource:
        """Build a distribution from a Data Package ``Resource``.

        Args:
            resource: The Data Package resource to convert.

        Returns:
            A new instance of ``cls``. Only the first license of ``resource``
            is considered, and only when it has a path.
        """
        # Instantiate through ``cls`` so subclasses get instances of themselves.
        dcat = cls()

        # Download URL
        # TODO: improve logic -- use basepath and allow only urls
        if resource.path:
            dcat.download_url = resource.path

        # Title
        if resource.title:
            dcat.title = resource.title

        # Description
        if resource.description:
            dcat.description = resource.description

        # Media type
        if resource.mediatype:
            dcat.media_type = resource.mediatype

        # Bytes
        # Compared against None so that a zero-byte resource keeps its size.
        if resource.bytes is not None:
            dcat.byte_size = resource.bytes

        # Licenses
        if resource.licenses:
            first_license = resource.licenses[0]
            if first_license.path:
                dcat.license = first_license.path

        return dcat
html_url: Optional[str] = None download_url: Optional[str] = None - # Mappers + # Converters def to_dp(self): - resource = Resource(path=self.path, name=path_to_name(self.path)) + resource = Resource(path=self.path, name=slugify_name(self.path)) # Bytes if self.size: diff --git a/dplib/plugins/pandas/models/field.py b/dplib/plugins/pandas/models/field.py index c96e036..96d137d 100644 --- a/dplib/plugins/pandas/models/field.py +++ b/dplib/plugins/pandas/models/field.py @@ -17,7 +17,7 @@ class PandasField(Model, arbitrary_types_allowed=True): dtype: Any dvalue: Optional[Any] = None - # Mappers + # Converters def to_dp(self) -> Field: field = Field(name=self.name) diff --git a/dplib/plugins/pandas/models/schema.py b/dplib/plugins/pandas/models/schema.py index 8b87b9b..2a7c4b5 100644 --- a/dplib/plugins/pandas/models/schema.py +++ b/dplib/plugins/pandas/models/schema.py @@ -13,7 +13,7 @@ class PandasSchema(Model, arbitrary_types_allowed=True): df: pd.DataFrame - # Mappers + # Converters def to_dp(self) -> Schema: schema = Schema() diff --git a/dplib/plugins/polars/models/field.py b/dplib/plugins/polars/models/field.py index 8d42513..f2c3dbc 100644 --- a/dplib/plugins/polars/models/field.py +++ b/dplib/plugins/polars/models/field.py @@ -13,7 +13,7 @@ class PolarsField(Model, arbitrary_types_allowed=True): dtype: Any # dtype: pl.PolarsDataType - # Mappers + # Converters def to_dp(self) -> Field: field = Field(name=self.name) diff --git a/dplib/plugins/polars/models/schema.py b/dplib/plugins/polars/models/schema.py index 665c571..d10494d 100644 --- a/dplib/plugins/polars/models/schema.py +++ b/dplib/plugins/polars/models/schema.py @@ -13,7 +13,7 @@ class PolarsSchema(Model, arbitrary_types_allowed=True): df: pl.DataFrame - # Mappers + # Converters def to_dp(self) -> Schema: schema = Schema() diff --git a/dplib/plugins/sql/models/field.py b/dplib/plugins/sql/models/field.py index 1f5f39d..1e18fd0 100644 --- a/dplib/plugins/sql/models/field.py +++ 
b/dplib/plugins/sql/models/field.py @@ -19,7 +19,7 @@ class SqlField(Model, arbitrary_types_allowed=True): column: Column[Any] - # Mappers + # Converters def to_dp(self) -> Field: field = Field(name=self.column.name) diff --git a/dplib/plugins/sql/models/schema.py b/dplib/plugins/sql/models/schema.py index 86b4fc2..5cca4c7 100644 --- a/dplib/plugins/sql/models/schema.py +++ b/dplib/plugins/sql/models/schema.py @@ -15,7 +15,7 @@ class SqlSchema(Model, arbitrary_types_allowed=True): table: Table - # Mappers + # Converters def to_dp(self, *, with_metadata: bool = False) -> Schema: schema = Schema() diff --git a/dplib/plugins/zenodo/models/package.py b/dplib/plugins/zenodo/models/package.py index c37dad4..76b4e02 100644 --- a/dplib/plugins/zenodo/models/package.py +++ b/dplib/plugins/zenodo/models/package.py @@ -28,7 +28,7 @@ class ZenodoPackage(Model): updated: Optional[str] = None links: Dict[str, str] = {} - # Mappers + # Converters def to_dp(self): package = Package() diff --git a/dplib/plugins/zenodo/models/resource.py b/dplib/plugins/zenodo/models/resource.py index c5161ac..816dd44 100644 --- a/dplib/plugins/zenodo/models/resource.py +++ b/dplib/plugins/zenodo/models/resource.py @@ -2,7 +2,7 @@ from typing import Optional -from dplib.helpers.resource import path_to_name +from dplib.helpers.resource import slugify_name from dplib.model import Model from dplib.models import Resource @@ -15,10 +15,10 @@ class ZenodoResource(Model): mimetype: Optional[str] = None size: Optional[int] = None - # Mappers + # Converters def to_dp(self) -> Resource: - resource = Resource(path=self.key, name=path_to_name(self.key)) + resource = Resource(path=self.key, name=slugify_name(self.key)) # Format if self.ext: diff --git a/dplib/types.py b/dplib/types.py index ffffc11..530bf73 100644 --- a/dplib/types.py +++ b/dplib/types.py @@ -1,3 +1,3 @@ from typing import Any, Dict -IData = Dict[str, Any] +IDict = Dict[str, Any] diff --git a/pyproject.toml b/pyproject.toml index 
b397335..03f2a4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,12 +33,16 @@ classifiers = [ dependencies = [ "pydantic>=2.0", "python-slugify>=6.0", + "fsspec[http]>=2023.1.0", + "typing-extensions>=4.0", ] [project.optional-dependencies] -sql = ["sqlalchemy>=1.4"] +dcat = ["rdflib>=6.0"] pandas = ["pandas>=1.0", "pandas-stubs>=1.0", "numpy>=1.0", "isodate>=0.6"] polars = ["polars-lts-cpu>=0.10"] +sql = ["sqlalchemy>=1.4"] +yaml = ["pyyaml>=5.0"] dev = [ "moto", "ruff",