Dcat mapper (#4)
* Added Dcat model

* Bootstrapped DcatPackage.from_xml

* Added more dcat mappings

* Updated model methods

* Finished dcat package props

* Improved model methods

* Mapped dcat resource

* Added todos

* Renamed parsers -> loaders/dumpers

* Improved model methods

* Added platform

* Removed platform

* Removed todos

* Mapped single value to graph

* Added DcatPackage.from/to_graph

* Added dcat namespaces

* Mapped package lists

* Fixed dcat model

* Finished dcat model mapping

* Sorted DcatResource props

* Sorted DcatPackage props

* Implemented dcat to dp

* Implemented dp to dcat
roll authored Dec 7, 2023
1 parent 410c5c0 commit 4ad6f9f
Showing 33 changed files with 856 additions and 59 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,8 +1,8 @@
# dplib-py
# Data Packaging Library

[![Build](https://img.shields.io/github/actions/workflow/status/frictionlessdata/dplib-py/general.yaml?branch=main)](https://github.com/frictionlessdata/dplib-py/actions)
[![Coverage](https://img.shields.io/codecov/c/github/frictionlessdata/dplib-py/main)](https://codecov.io/gh/frictionlessdata/dplib-py)
[![Release](https://img.shields.io/pypi/v/dplib-py.svg)](https://pypi.python.org/pypi/dplib-py)
[![Codebase](https://img.shields.io/badge/codebase-github-brightgreen)](https://github.com/frictionlessdata/dplib-py)

Python implementation of the Data Package standard
Python implementation of the Data Package standard and various models and utils for working with datasets.
Empty file added dplib/actions/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions dplib/actions/schema/check.py
@@ -0,0 +1,14 @@
# from pydantic import BaseModel, ValidationError
# from pydantic_core import ErrorDetails

# def schema_check(cls, descriptor: Dict[str, Any]):
#     errors: List[ErrorDetails] = []
#     try:
#         cls.model_validate(descriptor)
#     except ValidationError as e:
#         errors = e.errors()
#     return errors


def schema_check():
    pass
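For reference, a minimal sketch of what the commented-out validator above could look like once wired up, with the imports it needs; this helper is illustrative only and not part of this commit:

```python
from typing import Any, Dict, List, Type

from pydantic import BaseModel, ValidationError
from pydantic_core import ErrorDetails


def schema_check(cls: Type[BaseModel], descriptor: Dict[str, Any]) -> List[ErrorDetails]:
    # Validate the descriptor against the model and collect pydantic error details
    errors: List[ErrorDetails] = []
    try:
        cls.model_validate(descriptor)
    except ValidationError as exception:
        errors = exception.errors()
    return errors
```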
2 changes: 2 additions & 0 deletions dplib/error.py
@@ -0,0 +1,2 @@
class Error(Exception):
    pass
45 changes: 45 additions & 0 deletions dplib/helpers/file.py
@@ -0,0 +1,45 @@
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any, Optional

import fsspec # type: ignore

from ..error import Error


def read_file(path: str, *, mode: str = "rt", encoding: str = "utf-8") -> str:
    try:
        with fsspec.open(path, mode=mode, encoding=encoding) as file:  # type: ignore
            return file.read()  # type: ignore
    except Exception as exception:
        raise Error(f'Cannot read file "{path}": {exception}')


def write_file(path: str, body: Any, *, mode: str = "wt", encoding: str = "utf-8"):
    try:
        eff_enc = encoding if mode == "wt" else None
        with tempfile.NamedTemporaryFile(mode, delete=False, encoding=eff_enc) as file:
            file.write(body)
            file.flush()
            move_file(file.name, path, mode=0o644)
    except Exception as exception:
        raise Error(f'Cannot write file "{path}": {exception}')


def move_file(source: str, target: str, *, mode: Optional[int] = None):
    try:
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(source, target)
        if mode:
            os.chmod(target, mode)
    except Exception as exception:
        raise Error(f'Cannot move file "{source}:{target}": {exception}')


def infer_format(path: str):
    format = Path(path).suffix[1:]
    if format == "yml":
        format = "yaml"
    return format or None
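A quick illustration of the new file helpers (hypothetical usage, not part of the diff; the paths are made up, and fsspec resolves plain local paths via its file protocol):

```python
from dplib.helpers.file import infer_format, read_file, write_file

write_file("/tmp/datapackage.json", '{"name": "example"}')
print(read_file("/tmp/datapackage.json"))     # {"name": "example"}
print(infer_format("/tmp/datapackage.json"))  # json
print(infer_format("dataset.yml"))            # yaml ("yml" is normalized)
print(infer_format("README"))                 # None (no extension)
```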
4 changes: 2 additions & 2 deletions dplib/helpers/resource.py
@@ -3,5 +3,5 @@
from slugify import slugify


def path_to_name(path: str) -> str:
    return slugify(Path(path).stem, separator="_")
def slugify_name(name: str) -> str:
    return slugify(Path(name).stem, separator="_")
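A small hypothetical example of the renamed helper (not part of the commit): it slugifies the stem of a file name so it can be used as a Data Package resource name.

```python
from dplib.helpers.resource import slugify_name

print(slugify_name("My Data File.CSV"))         # my_data_file
print(slugify_name("weather/2023-report.csv"))  # 2023_report
```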
73 changes: 43 additions & 30 deletions dplib/model.py
@@ -1,54 +1,67 @@
from __future__ import annotations

import json
import pprint
from typing import Any, Dict, List
from importlib import import_module
from typing import Optional

from pydantic import BaseModel, ValidationError
from pydantic_core import ErrorDetails
from pydantic import BaseModel
from typing_extensions import Self

from . import types
from .error import Error
from .helpers.file import infer_format, read_file, write_file


class Model(BaseModel, extra="forbid", validate_assignment=True):
    custom: types.IData = {}
    custom: types.IDict = {}

    def __str__(self) -> str:
        return repr(self)

    def __repr__(self) -> str:
        return pprint.pformat(self.to_dict(), sort_dicts=False)

    # Validators

    # TODO: rebase on validate_yaml/json/dict?
    @classmethod
    def validate_descriptor(cls, descriptor: Dict[str, Any]):
        errors: List[ErrorDetails] = []
        try:
            cls.model_validate(descriptor)
        except ValidationError as e:
            errors = e.errors()
        return errors
    # Converters

    # Mappers
    def to_path(self, path: str, *, format: Optional[str] = None):
        format = format or infer_format(path)
        if not format:
            raise Error(f"Cannot infer format from path: {path}")
        text = self.to_text(format=format)
        write_file(path, text)

    @classmethod
    def from_yaml(cls, path: str):
        pass
    def from_path(cls, path: str, *, format: Optional[str] = None) -> Self:
        format = format or infer_format(path)
        if not format:
            raise Error(f"Cannot infer format from path: {path}")
        text = read_file(path)
        return cls.from_text(text, format=format)  # type: ignore

    @classmethod
    def to_yaml(cls, path: str):
        pass
    def to_text(self, *, format: str) -> str:
        data = self.to_dict()
        if format == "json":
            return json.dumps(data)
        elif format == "yaml":
            yaml = import_module("yaml")
            return yaml.dump(data)
        raise Error(f"Cannot convert to text for format: {format}")

    @classmethod
    def from_json(cls, path: str):
        pass
    def from_text(cls, text: str, *, format: str) -> Self:
        if format == "json":
            data = json.loads(text)
            return cls.from_dict(data)
        elif format == "yaml":
            yaml = import_module("yaml")
            data = yaml.safe_load(text)
            return cls.from_dict(data)
        raise Error(f"Cannot create from text with format: {format}")

    @classmethod
    def to_json(cls, path: str):
        pass
    def to_dict(self):
        return self.model_dump(mode="json", exclude_unset=True, exclude_none=True)

    @classmethod
    def from_dict(cls, data: types.IData):
    def from_dict(cls, data: types.IDict) -> Self:
        return cls(**data)

    def to_dict(self):
        return self.model_dump(mode="json", exclude_unset=True, exclude_none=True)
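To show how the reshaped Model API fits together, here is a minimal hypothetical sketch (the Example class is made up; it assumes dplib is installed and PyYAML is available for the YAML branch):

```python
from typing import Optional

from dplib.model import Model


class Example(Model):
    name: Optional[str] = None
    title: Optional[str] = None


example = Example(name="example", title="Example")

# Text round trip (format is given explicitly)
text = example.to_text(format="json")  # '{"name": "example", "title": "Example"}'
restored = Example.from_text(text, format="json")
assert restored.to_dict() == example.to_dict()

# Path round trip (format is inferred from the file extension)
example.to_path("/tmp/example.yaml")
print(Example.from_path("/tmp/example.yaml"))
```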
8 changes: 4 additions & 4 deletions dplib/models/resource/resource.py
@@ -17,7 +17,7 @@ class Resource(Model):
    profile: Optional[str] = None

    path: Optional[str] = None
    data: Optional[types.IData] = None
    data: Optional[types.IDict] = None

    dialect: Optional[Dialect] = None
    schema: Optional[Schema] = None  # type: ignore
@@ -29,9 +29,9 @@ class Resource(Model):
    encoding: Optional[str] = None
    bytes: Optional[int] = None
    hash: Optional[str] = None
    sources: Optional[List[Source]] = None
    licenses: Optional[List[License]] = None
    contributors: Optional[List[Contributor]] = None
    sources: List[Source] = []
    licenses: List[License] = []
    contributors: List[Contributor] = []

    @property
    def parsed_hash(self) -> Optional[ParsedHash]:
2 changes: 1 addition & 1 deletion dplib/plugins/ckan/models/package.py
@@ -35,7 +35,7 @@ class CkanPackage(Model):
    metadata_created: Optional[str] = None
    metadata_modified: Optional[str] = None

    # Mappers
    # Converters

    def to_dp(self):
        package = Package()
6 changes: 3 additions & 3 deletions dplib/plugins/ckan/models/resource.py
@@ -2,7 +2,7 @@

from typing import Optional

from dplib.helpers.resource import path_to_name
from dplib.helpers.resource import slugify_name
from dplib.model import Model
from dplib.models import Resource

@@ -19,10 +19,10 @@ class CkanResource(Model):
    mimetype: Optional[str] = None
    size: Optional[int] = None

    # Mappers
    # Converters

    def to_dp(self) -> Resource:
        resource = Resource(path=self.name, name=path_to_name(self.name))
        resource = Resource(path=self.name, name=slugify_name(self.name))

        # Format
        if self.format:
Empty file added dplib/plugins/cli/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion dplib/plugins/datacite/models/package.py
@@ -33,7 +33,7 @@ class DatacitePackage(Model):
    subjects: List[DataciteSubject] = []
    titles: List[DataciteTitle] = []

    # Mappers
    # Converters

    def to_dp(self) -> Package:
        package = Package()
2 changes: 2 additions & 0 deletions dplib/plugins/dcat/models/__init__.py
@@ -0,0 +1,2 @@
from .package import DcatPackage
from .resource import DcatResource
17 changes: 17 additions & 0 deletions dplib/plugins/dcat/models/dumpers.py
@@ -0,0 +1,17 @@
from typing import Any

from rdflib import Graph, URIRef

from .helpers import create_node
from .types import ISubject


def id(g: Graph, identifier: str, *, predicate: URIRef, object: URIRef):
    subject = URIRef(identifier)
    g.add((subject, predicate, object))
    return subject


def node(g: Graph, value: Any, *, subject: ISubject, predicate: URIRef):
    object = create_node(value)
    g.add((subject, predicate, object))
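A hypothetical usage sketch of the dumper helpers (not part of the commit; the dataset URI is made up, and the namespace constants come from the namespaces module added below):

```python
from rdflib import Graph

from dplib.plugins.dcat.models import dumpers
from dplib.plugins.dcat.models.namespaces import DATASET, TITLE, TYPE

g = Graph()

# <https://example.com/dataset/1> rdf:type dcat:Dataset
subject = dumpers.id(g, "https://example.com/dataset/1", predicate=TYPE, object=DATASET)

# attach a literal dct:title to the same subject
dumpers.node(g, "Example dataset", subject=subject, predicate=TITLE)

print(g.serialize(format="turtle"))
```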
29 changes: 29 additions & 0 deletions dplib/plugins/dcat/models/helpers.py
@@ -0,0 +1,29 @@
from typing import Any, Union
from urllib.parse import quote

from rdflib import Literal, URIRef


# https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/profiles.py
def create_node(value: Any) -> Union[URIRef, Literal]:
    try:
        stripped_value = value.strip()
        if stripped_value.startswith("http://") or stripped_value.startswith("https://"):
            # only encode this limited subset of characters to avoid more complex URL parsing
            # (e.g. valid ? in query string vs. ? as value).
            # can be applied multiple times, as encoded %xy is left untouched. Therefore, no
            # unquote is necessary beforehand.
            quotechars = " !\"$'()*,;<>[]{|}\\^`"
            for c in quotechars:
                value = value.replace(c, quote(c))
            # although all invalid chars checked by rdflib should have been quoted, try to serialize
            # the object. If it breaks, use Literal instead.
            value = URIRef(value)
            value.n3()
            # URI is fine, return the object
            return value
        else:
            return Literal(value)
    except Exception:
        # In case something goes wrong: use Literal
        return Literal(value)
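In short: URL-like strings become URIRefs with a limited set of unsafe characters percent-encoded, while everything else falls back to a Literal. A hypothetical illustration (not part of the commit):

```python
from rdflib import Literal, URIRef

from dplib.plugins.dcat.models.helpers import create_node

# space is in the quoted character set, so it is percent-encoded
assert create_node("https://example.com/some file.csv") == URIRef("https://example.com/some%20file.csv")

# plain text is kept as a Literal
assert create_node("Just a description") == Literal("Just a description")
```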
57 changes: 57 additions & 0 deletions dplib/plugins/dcat/models/loaders.py
@@ -0,0 +1,57 @@
from typing import List, Optional

from rdflib import Graph, Literal, URIRef

from .types import IStringNode, ISubject


def id(g: Graph, *, predicate: URIRef, object: URIRef) -> Optional[URIRef]:
    try:
        id = g.value(predicate=predicate, object=object)
        if isinstance(id, URIRef):
            return id
    except Exception:
        pass


def node(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[IStringNode]:
    default_lang = "en"
    items = list(g.objects(subject, predicate))

    # Prefer the default language
    for item in items:
        if isinstance(item, Literal):
            if item.language and item.language == default_lang:
                return item

    # Otherwise, return the first item
    for item in items:
        if isinstance(item, (URIRef, Literal)):
            return item


def string(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[str]:
    value = node(g, subject=subject, predicate=predicate)
    if value:
        return str(value)


def integer(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[int]:
    value = node(g, subject=subject, predicate=predicate)
    if value:
        try:
            return int(value)
        except Exception:
            pass


def nodes(g: Graph, *, subject: ISubject, predicate: URIRef) -> List[IStringNode]:
    return [
        item
        for item in g.objects(subject, predicate)
        if isinstance(item, (URIRef, Literal))
    ]


def strings(g: Graph, *, subject: ISubject, predicate: URIRef) -> List[str]:
    return [str(item) for item in nodes(g, subject=subject, predicate=predicate)]
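A hypothetical sketch of the loader helpers against a tiny in-memory graph (not part of the commit; the dataset URI and values are made up, and the namespace constants come from the module below):

```python
from rdflib import Graph, Literal, URIRef

from dplib.plugins.dcat.models import loaders
from dplib.plugins.dcat.models.namespaces import BYTE_SIZE, KEYWORD, TITLE

g = Graph()
dataset = URIRef("https://example.com/dataset/1")
g.add((dataset, TITLE, Literal("Titel", lang="de")))
g.add((dataset, TITLE, Literal("Title", lang="en")))
g.add((dataset, KEYWORD, Literal("csv")))
g.add((dataset, KEYWORD, Literal("open-data")))
g.add((dataset, BYTE_SIZE, Literal(1024)))

print(loaders.string(g, subject=dataset, predicate=TITLE))      # "Title" (the "en" literal wins)
print(loaders.strings(g, subject=dataset, predicate=KEYWORD))   # ["csv", "open-data"] (order not guaranteed)
print(loaders.integer(g, subject=dataset, predicate=BYTE_SIZE)) # 1024
```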
44 changes: 44 additions & 0 deletions dplib/plugins/dcat/models/namespaces.py
@@ -0,0 +1,44 @@
from rdflib import Namespace
from rdflib.namespace import FOAF, RDF

ADMS = Namespace("http://www.w3.org/ns/adms#")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
DCT = Namespace("http://purl.org/dc/terms/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")

ACCESS_URL = DCAT.accessURL
ACCURAL_PERIODICITY = DCT.accrualPeriodicity
ALTERNATE_IDENTIFIER = ADMS.identifier
BYTE_SIZE = DCAT.byteSize
COMFORMS_TO = DCT.conformsTo
DATASET = DCAT.Dataset
DESCRIPTION = DCT.description
DISTRIBUTION = DCAT.distribution
DOWNLOAD_URL = DCAT.downloadURL
HAS_VERSION = DCT.hasVersion
HOMEPAGE = FOAF.homepage
IDENTIFIER = DCT.identifier
ISSUED = DCT.issued
IS_VERSION_OF = DCT.isVersionOf
KEYWORD = DCAT.keyword
LANDING_PAGE = DCAT.landingPage
LANGUAGE = DCT.language
LICENSE = DCT.license
MEDIA_TYPE = DCAT.mediaType
MODIFIED = DCT.modified
PAGE = FOAF.page
PROVENANCE = DCT.provenance
RELATED_RESOURCE = DCT.relation
SAMPLE = ADMS.sample
SOURCE = DCT.source
THEME = DCAT.theme
TITLE = DCT.title
TYPE = RDF.type
VERSION = OWL.versionInfo

BINDINGS = {
    "adms": ADMS,
    "dcat": DCAT,
    "dct": DCT,
    "owl": OWL,
}
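A hypothetical sketch of how these constants and BINDINGS might be used when building a graph, so the serialized Turtle uses the adms:/dcat:/dct:/owl: prefixes instead of full URIs (not part of the commit):

```python
from rdflib import Graph, Literal, URIRef

from dplib.plugins.dcat.models.namespaces import BINDINGS, DATASET, TITLE, TYPE

g = Graph()
for prefix, namespace in BINDINGS.items():
    g.bind(prefix, namespace)

dataset = URIRef("https://example.com/dataset/1")
g.add((dataset, TYPE, DATASET))
g.add((dataset, TITLE, Literal("Example dataset")))

print(g.serialize(format="turtle"))
```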