Dcat mapper (#4)
* Added Dcat model

* Bootstrapped DcatPackage.from_xml

* Added more dcat mappings

* Updated model methods

* Finished dcat package props

* Improved model methods

* Mapped dcat resource

* Added todos

* Renamed parsers -> loaders/dumpers

* Improved model methods

* Added platform

* Removed platform

* Removed todos

* Mapped single value to graph

* Added DcatPackage.from/to_graph

* Added dcat namespaces

* Mapped package lists

* Fixed dcat model

* Finished dcat model mapping

* Sorted DcatResource props

* Sorted DcatPackage props

* Implemented dcat to dp

* Implemented dp to dcat
roll authored Dec 7, 2023
1 parent 410c5c0 commit 4ad6f9f
Showing 33 changed files with 856 additions and 59 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,8 +1,8 @@
# dplib-py
# Data Packaging Library

[![Build](https://img.shields.io/github/actions/workflow/status/frictionlessdata/dplib-py/general.yaml?branch=main)](https://github.com/frictionlessdata/dplib-py/actions)
[![Coverage](https://img.shields.io/codecov/c/github/frictionlessdata/dplib-py/main)](https://codecov.io/gh/frictionlessdata/dplib-py)
[![Release](https://img.shields.io/pypi/v/dplib-py.svg)](https://pypi.python.org/pypi/dplib-py)
[![Codebase](https://img.shields.io/badge/codebase-github-brightgreen)](https://github.com/frictionlessdata/dplib-py)

Python implementation of the Data Package standard
Python implementation of the Data Package standard and various models and utils for working with datasets.
Empty file added dplib/actions/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions dplib/actions/schema/check.py
@@ -0,0 +1,14 @@
# from pydantic import BaseModel, ValidationError
# from pydantic_core import ErrorDetails

# def schema_check(cls, descriptor: Dict[str, Any]):
#     errors: List[ErrorDetails] = []
#     try:
#         cls.model_validate(descriptor)
#     except ValidationError as e:
#         errors = e.errors()
#     return errors


def schema_check():
    pass
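For reference, a minimal sketch of what the commented-out validator above could look like once wired up, with the imports it needs; this helper is illustrative only and not part of this commit:

```python
from typing import Any, Dict, List, Type

from pydantic import BaseModel, ValidationError
from pydantic_core import ErrorDetails


def schema_check(cls: Type[BaseModel], descriptor: Dict[str, Any]) -> List[ErrorDetails]:
    # Validate the descriptor against the model and collect pydantic error details
    errors: List[ErrorDetails] = []
    try:
        cls.model_validate(descriptor)
    except ValidationError as exception:
        errors = exception.errors()
    return errors
```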
2 changes: 2 additions & 0 deletions dplib/error.py
@@ -0,0 +1,2 @@
class Error(Exception):
    pass
45 changes: 45 additions & 0 deletions dplib/helpers/file.py
@@ -0,0 +1,45 @@
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any, Optional

import fsspec # type: ignore

from ..error import Error


def read_file(path: str, *, mode: str = "rt", encoding: str = "utf-8") -> str:
    try:
        with fsspec.open(path, mode=mode, encoding=encoding) as file:  # type: ignore
            return file.read()  # type: ignore
    except Exception as exception:
        raise Error(f'Cannot read file "{path}": {exception}')


def write_file(path: str, body: Any, *, mode: str = "wt", encoding: str = "utf-8"):
    try:
        eff_enc = encoding if mode == "wt" else None
        with tempfile.NamedTemporaryFile(mode, delete=False, encoding=eff_enc) as file:
            file.write(body)
            file.flush()
            move_file(file.name, path, mode=0o644)
    except Exception as exception:
        raise Error(f'Cannot write file "{path}": {exception}')


def move_file(source: str, target: str, *, mode: Optional[int] = None):
    try:
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.move(source, target)
        if mode:
            os.chmod(target, mode)
    except Exception as exception:
        raise Error(f'Cannot move file "{source}:{target}": {exception}')


def infer_format(path: str):
    format = Path(path).suffix[1:]
    if format == "yml":
        format = "yaml"
    return format or None
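A quick illustration of the new file helpers (hypothetical usage, not part of the diff; the paths are made up, and fsspec resolves plain local paths via its file protocol):

```python
from dplib.helpers.file import infer_format, read_file, write_file

write_file("/tmp/datapackage.json", '{"name": "example"}')
print(read_file("/tmp/datapackage.json"))     # {"name": "example"}
print(infer_format("/tmp/datapackage.json"))  # json
print(infer_format("dataset.yml"))            # yaml ("yml" is normalized)
print(infer_format("README"))                 # None (no extension)
```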
4 changes: 2 additions & 2 deletions dplib/helpers/resource.py
@@ -3,5 +3,5 @@
from slugify import slugify


def path_to_name(path: str) -> str:
    return slugify(Path(path).stem, separator="_")
def slugify_name(name: str) -> str:
    return slugify(Path(name).stem, separator="_")
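A small hypothetical example of the renamed helper (not part of the commit): it slugifies the stem of a file name so it can be used as a Data Package resource name.

```python
from dplib.helpers.resource import slugify_name

print(slugify_name("My Data File.CSV"))         # my_data_file
print(slugify_name("weather/2023-report.csv"))  # 2023_report
```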
73 changes: 43 additions & 30 deletions dplib/model.py
@@ -1,54 +1,67 @@
from __future__ import annotations

import json
import pprint
from typing import Any, Dict, List
from importlib import import_module
from typing import Optional

from pydantic import BaseModel, ValidationError
from pydantic_core import ErrorDetails
from pydantic import BaseModel
from typing_extensions import Self

from . import types
from .error import Error
from .helpers.file import infer_format, read_file, write_file


class Model(BaseModel, extra="forbid", validate_assignment=True):
    custom: types.IData = {}
    custom: types.IDict = {}

    def __str__(self) -> str:
        return repr(self)

    def __repr__(self) -> str:
        return pprint.pformat(self.to_dict(), sort_dicts=False)

    # Validators

    # TODO: rebase on validate_yaml/json/dict?
    @classmethod
    def validate_descriptor(cls, descriptor: Dict[str, Any]):
        errors: List[ErrorDetails] = []
        try:
            cls.model_validate(descriptor)
        except ValidationError as e:
            errors = e.errors()
        return errors
    # Converters

    # Mappers
    def to_path(self, path: str, *, format: Optional[str] = None):
        format = format or infer_format(path)
        if not format:
            raise Error(f"Cannot infer format from path: {path}")
        text = self.to_text(format=format)
        write_file(path, text)

    @classmethod
    def from_yaml(cls, path: str):
        pass
    def from_path(cls, path: str, *, format: Optional[str] = None) -> Self:
        format = format or infer_format(path)
        if not format:
            raise Error(f"Cannot infer format from path: {path}")
        text = read_file(path)
        return cls.from_text(text, format=format)  # type: ignore

    @classmethod
    def to_yaml(cls, path: str):
        pass
    def to_text(self, *, format: str) -> str:
        data = self.to_dict()
        if format == "json":
            return json.dumps(data)
        elif format == "yaml":
            yaml = import_module("yaml")
            return yaml.dump(data)
        raise Error(f"Cannot convert to text for format: {format}")

    @classmethod
    def from_json(cls, path: str):
        pass
    def from_text(cls, text: str, *, format: str) -> Self:
        if format == "json":
            data = json.loads(text)
            return cls.from_dict(data)
        elif format == "yaml":
            yaml = import_module("yaml")
            data = yaml.safe_load(text)
            return cls.from_dict(data)
        raise Error(f"Cannot create from text with format: {format}")

    @classmethod
    def to_json(cls, path: str):
        pass
    def to_dict(self):
        return self.model_dump(mode="json", exclude_unset=True, exclude_none=True)

    @classmethod
    def from_dict(cls, data: types.IData):
    def from_dict(cls, data: types.IDict) -> Self:
        return cls(**data)

    def to_dict(self):
        return self.model_dump(mode="json", exclude_unset=True, exclude_none=True)
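To show how the reshaped Model API fits together, here is a minimal hypothetical sketch (the Example class is made up; it assumes dplib is installed and PyYAML is available for the YAML branch):

```python
from typing import Optional

from dplib.model import Model


class Example(Model):
    name: Optional[str] = None
    title: Optional[str] = None


example = Example(name="example", title="Example")

# Text round trip (format is given explicitly)
text = example.to_text(format="json")  # '{"name": "example", "title": "Example"}'
restored = Example.from_text(text, format="json")
assert restored.to_dict() == example.to_dict()

# Path round trip (format is inferred from the file extension)
example.to_path("/tmp/example.yaml")
print(Example.from_path("/tmp/example.yaml"))
```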
8 changes: 4 additions & 4 deletions dplib/models/resource/resource.py
@@ -17,7 +17,7 @@ class Resource(Model):
    profile: Optional[str] = None

    path: Optional[str] = None
    data: Optional[types.IData] = None
    data: Optional[types.IDict] = None

    dialect: Optional[Dialect] = None
    schema: Optional[Schema] = None  # type: ignore
@@ -29,9 +29,9 @@ class Resource(Model):
    encoding: Optional[str] = None
    bytes: Optional[int] = None
    hash: Optional[str] = None
    sources: Optional[List[Source]] = None
    licenses: Optional[List[License]] = None
    contributors: Optional[List[Contributor]] = None
    sources: List[Source] = []
    licenses: List[License] = []
    contributors: List[Contributor] = []

    @property
    def parsed_hash(self) -> Optional[ParsedHash]:
2 changes: 1 addition & 1 deletion dplib/plugins/ckan/models/package.py
@@ -35,7 +35,7 @@ class CkanPackage(Model):
    metadata_created: Optional[str] = None
    metadata_modified: Optional[str] = None

    # Mappers
    # Converters

    def to_dp(self):
        package = Package()
6 changes: 3 additions & 3 deletions dplib/plugins/ckan/models/resource.py
@@ -2,7 +2,7 @@

from typing import Optional

from dplib.helpers.resource import path_to_name
from dplib.helpers.resource import slugify_name
from dplib.model import Model
from dplib.models import Resource

@@ -19,10 +19,10 @@ class CkanResource(Model):
    mimetype: Optional[str] = None
    size: Optional[int] = None

    # Mappers
    # Converters

    def to_dp(self) -> Resource:
        resource = Resource(path=self.name, name=path_to_name(self.name))
        resource = Resource(path=self.name, name=slugify_name(self.name))

        # Format
        if self.format:
Empty file added dplib/plugins/cli/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion dplib/plugins/datacite/models/package.py
@@ -33,7 +33,7 @@ class DatacitePackage(Model):
    subjects: List[DataciteSubject] = []
    titles: List[DataciteTitle] = []

    # Mappers
    # Converters

    def to_dp(self) -> Package:
        package = Package()
2 changes: 2 additions & 0 deletions dplib/plugins/dcat/models/__init__.py
@@ -0,0 +1,2 @@
from .package import DcatPackage
from .resource import DcatResource
17 changes: 17 additions & 0 deletions dplib/plugins/dcat/models/dumpers.py
@@ -0,0 +1,17 @@
from typing import Any

from rdflib import Graph, URIRef

from .helpers import create_node
from .types import ISubject


def id(g: Graph, identifier: str, *, predicate: URIRef, object: URIRef):
    subject = URIRef(identifier)
    g.add((subject, predicate, object))
    return subject


def node(g: Graph, value: Any, *, subject: ISubject, predicate: URIRef):
    object = create_node(value)
    g.add((subject, predicate, object))
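A hypothetical usage sketch of the dumper helpers (not part of the commit; the dataset URI is made up, and the namespace constants come from the namespaces module added below):

```python
from rdflib import Graph

from dplib.plugins.dcat.models import dumpers
from dplib.plugins.dcat.models.namespaces import DATASET, TITLE, TYPE

g = Graph()

# <https://example.com/dataset/1> rdf:type dcat:Dataset
subject = dumpers.id(g, "https://example.com/dataset/1", predicate=TYPE, object=DATASET)

# attach a literal dct:title to the same subject
dumpers.node(g, "Example dataset", subject=subject, predicate=TITLE)

print(g.serialize(format="turtle"))
```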
29 changes: 29 additions & 0 deletions dplib/plugins/dcat/models/helpers.py
@@ -0,0 +1,29 @@
from typing import Any, Union
from urllib.parse import quote

from rdflib import Literal, URIRef


# https://github.com/ckan/ckanext-dcat/blob/master/ckanext/dcat/profiles.py
def create_node(value: Any) -> Union[URIRef, Literal]:
    try:
        stripped_value = value.strip()
        if stripped_value.startswith("http://") or stripped_value.startswith("https://"):
            # only encode this limited subset of characters to avoid more complex URL parsing
            # (e.g. valid ? in query string vs. ? as value).
            # can be applied multiple times, as encoded %xy is left untouched. Therefore, no
            # unquote is necessary beforehand.
            quotechars = " !\"$'()*,;<>[]{|}\\^`"
            for c in quotechars:
                value = value.replace(c, quote(c))
            # although all invalid chars checked by rdflib should have been quoted, try to serialize
            # the object. If it breaks, use Literal instead.
            value = URIRef(value)
            value.n3()
            # URI is fine, return the object
            return value
        else:
            return Literal(value)
    except Exception:
        # In case something goes wrong: use Literal
        return Literal(value)
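In short: URL-like strings become URIRefs with a limited set of unsafe characters percent-encoded, while everything else falls back to a Literal. A hypothetical illustration (not part of the commit):

```python
from rdflib import Literal, URIRef

from dplib.plugins.dcat.models.helpers import create_node

# space is in the quoted character set, so it is percent-encoded
assert create_node("https://example.com/some file.csv") == URIRef("https://example.com/some%20file.csv")

# plain text is kept as a Literal
assert create_node("Just a description") == Literal("Just a description")
```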
57 changes: 57 additions & 0 deletions dplib/plugins/dcat/models/loaders.py
@@ -0,0 +1,57 @@
from typing import List, Optional

from rdflib import Graph, Literal, URIRef

from .types import IStringNode, ISubject


def id(g: Graph, *, predicate: URIRef, object: URIRef) -> Optional[URIRef]:
    try:
        id = g.value(predicate=predicate, object=object)
        if isinstance(id, URIRef):
            return id
    except Exception:
        pass


def node(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[IStringNode]:
    default_lang = "en"
    items = list(g.objects(subject, predicate))

    # Prefer the default language
    for item in items:
        if isinstance(item, Literal):
            if item.language and item.language == default_lang:
                return item

    # Otherwise, return the first item
    for item in items:
        if isinstance(item, (URIRef, Literal)):
            return item


def string(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[str]:
    value = node(g, subject=subject, predicate=predicate)
    if value:
        return str(value)


def integer(g: Graph, *, subject: ISubject, predicate: URIRef) -> Optional[int]:
    value = node(g, subject=subject, predicate=predicate)
    if value:
        try:
            return int(value)
        except Exception:
            pass


def nodes(g: Graph, *, subject: ISubject, predicate: URIRef) -> List[IStringNode]:
    return [
        item
        for item in g.objects(subject, predicate)
        if isinstance(item, (URIRef, Literal))
    ]


def strings(g: Graph, *, subject: ISubject, predicate: URIRef) -> List[str]:
    return [str(item) for item in nodes(g, subject=subject, predicate=predicate)]
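A hypothetical sketch of the loader helpers against a tiny in-memory graph (not part of the commit; the dataset URI and values are made up, and the namespace constants come from the module below):

```python
from rdflib import Graph, Literal, URIRef

from dplib.plugins.dcat.models import loaders
from dplib.plugins.dcat.models.namespaces import BYTE_SIZE, KEYWORD, TITLE

g = Graph()
dataset = URIRef("https://example.com/dataset/1")
g.add((dataset, TITLE, Literal("Titel", lang="de")))
g.add((dataset, TITLE, Literal("Title", lang="en")))
g.add((dataset, KEYWORD, Literal("csv")))
g.add((dataset, KEYWORD, Literal("open-data")))
g.add((dataset, BYTE_SIZE, Literal(1024)))

print(loaders.string(g, subject=dataset, predicate=TITLE))      # "Title" (the "en" literal wins)
print(loaders.strings(g, subject=dataset, predicate=KEYWORD))   # ["csv", "open-data"] (order not guaranteed)
print(loaders.integer(g, subject=dataset, predicate=BYTE_SIZE)) # 1024
```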
44 changes: 44 additions & 0 deletions dplib/plugins/dcat/models/namespaces.py
@@ -0,0 +1,44 @@
from rdflib import Namespace
from rdflib.namespace import FOAF, RDF

ADMS = Namespace("http://www.w3.org/ns/adms#")
DCAT = Namespace("http://www.w3.org/ns/dcat#")
DCT = Namespace("http://purl.org/dc/terms/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")

ACCESS_URL = DCAT.accessURL
ACCURAL_PERIODICITY = DCT.accrualPeriodicity
ALTERNATE_IDENTIFIER = ADMS.identifier
BYTE_SIZE = DCAT.byteSize
COMFORMS_TO = DCT.conformsTo
DATASET = DCAT.Dataset
DESCRIPTION = DCT.description
DISTRIBUTION = DCAT.distribution
DOWNLOAD_URL = DCAT.downloadURL
HAS_VERSION = DCT.hasVersion
HOMEPAGE = FOAF.homepage
IDENTIFIER = DCT.identifier
ISSUED = DCT.issued
IS_VERSION_OF = DCT.isVersionOf
KEYWORD = DCAT.keyword
LANDING_PAGE = DCAT.landingPage
LANGUAGE = DCT.language
LICENSE = DCT.license
MEDIA_TYPE = DCAT.mediaType
MODIFIED = DCT.modified
PAGE = FOAF.page
PROVENANCE = DCT.provenance
RELATED_RESOURCE = DCT.relation
SAMPLE = ADMS.sample
SOURCE = DCT.source
THEME = DCAT.theme
TITLE = DCT.title
TYPE = RDF.type
VERSION = OWL.versionInfo

BINDINGS = {
    "adms": ADMS,
    "dcat": DCAT,
    "dct": DCT,
    "owl": OWL,
}
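A hypothetical sketch of how these constants and BINDINGS might be used when building a graph, so the serialized Turtle uses the adms:/dcat:/dct:/owl: prefixes instead of full URIs (not part of the commit):

```python
from rdflib import Graph, Literal, URIRef

from dplib.plugins.dcat.models.namespaces import BINDINGS, DATASET, TITLE, TYPE

g = Graph()
for prefix, namespace in BINDINGS.items():
    g.bind(prefix, namespace)

dataset = URIRef("https://example.com/dataset/1")
g.add((dataset, TYPE, DATASET))
g.add((dataset, TITLE, Literal("Example dataset")))

print(g.serialize(format="turtle"))
```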