Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup #129

Merged
merged 3 commits into from
May 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions rocrate/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2019-2022 The University of Manchester, UK
# Copyright 2020-2022 Vlaams Instituut voor Biotechnologie (VIB), BE
# Copyright 2020-2022 Barcelona Supercomputing Center (BSC), ES
# Copyright 2020-2022 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT
# Copyright 2022 École Polytechnique Fédérale de Lausanne, CH
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import warnings

from .model.metadata import Metadata, LegacyMetadata


def read_metadata(metadata_path):
    """\
    Read an RO-Crate metadata file.

    :param metadata_path: path to the crate's metadata file
      (``ro-crate-metadata.json`` or the legacy ``ro-crate-metadata.jsonld``).
    :return: a tuple of two elements: the context; a dictionary that maps
      entity ids to the entities themselves.
    :raises ValueError: if the file does not contain both a ``@context``
      and a ``@graph`` key.
    """
    # JSON-LD is UTF-8 by specification: decode explicitly rather than with
    # the platform default encoding (which may be e.g. cp1252 on Windows).
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)
    try:
        context = metadata['@context']
        graph = metadata['@graph']
    except KeyError:
        raise ValueError(f"{metadata_path} must have a @context and a @graph")
    return context, {_["@id"]: _ for _ in graph}


def _check_descriptor(descriptor, entities):
if descriptor["@type"] != "CreativeWork":
raise ValueError('metadata descriptor must be of type "CreativeWork"')
try:
root = entities[descriptor["about"]["@id"]]
except (KeyError, TypeError):
raise ValueError("metadata descriptor does not reference the root entity")
if ("Dataset" not in root["@type"] if isinstance(root["@type"], list) else root["@type"] != "Dataset"):
raise ValueError('root entity must have "Dataset" among its types')
return descriptor["@id"], root["@id"]


def find_root_entity_id(entities):
    """\
    Locate the metadata file descriptor and the root data entity.

    *entities* is a dictionary that maps JSON entity IDs to the entities
    themselves (like the second element returned by read_metadata).

    Return a tuple of the corresponding identifiers (descriptor, root).
    Raise KeyError when no descriptor is found; raise ValueError when one
    is found but does not satisfy the required constraints.

    In general the descriptor id can be an absolute URI whose last path
    segment is "ro-crate-metadata.json[ld]", and a crate may contain more
    than one such id, so we may have to choose among several (descriptor,
    root) pairs. Pairs failing the basic constraints (descriptor of type
    CreativeWork, etc.) are discarded first; if more than one remains, a
    heuristic decides: given surviving pairs (m1, r1) and (m2, r2), if r1
    is the actual root of this crate then m2 and r2 are ordinary files in
    it and must appear in r1's hasPart, whereas r2 is not required to
    list anything. We therefore prefer a pair whose root "contains" the
    descriptors of every other pair; when no pair (or more than one pair)
    qualifies, an arbitrary one is returned.
    """
    descriptor = entities.get(Metadata.BASENAME, entities.get(LegacyMetadata.BASENAME))
    if descriptor:
        return _check_descriptor(descriptor, entities)
    known_names = (Metadata.BASENAME, LegacyMetadata.BASENAME)
    candidates = []
    for entity_id, entity in entities.items():
        if entity_id.rsplit("/", 1)[-1] in known_names:
            try:
                candidates.append(_check_descriptor(entity, entities))
            except ValueError:
                pass
    if not candidates:
        raise KeyError("Metadata file descriptor not found")
    if len(candidates) == 1:
        return candidates[0]
    warnings.warn("Multiple metadata file descriptors, will pick one with a heuristic")
    descriptor_ids = {pair[0] for pair in candidates}
    for desc_id, root_id in candidates:
        try:
            contained = {part["@id"] for part in entities[root_id]["hasPart"]}
        except KeyError:
            continue
        if contained >= descriptor_ids - {desc_id}:
            # more than one candidate may pass this check; the first wins
            return desc_id, root_id
    return candidates[0]  # no pair stood out: fall back to an arbitrary pick
116 changes: 1 addition & 115 deletions rocrate/model/contextentity.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,52 +18,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .. import vocabs
from ..utils import as_list, is_url

from ..utils import is_url
from .entity import Entity


"""
A property class that can be used during class declaration
to make getter/setter properties.

The class name under construction is assumed to be a valid class name
in schema.org as referenced from the RO-Crate JSON-LD context,
and likewise the class properties defined using this
are assumed to be valid schema.org properties.

The setters handle any Entity by picking up their @id instead
of nesting their objects.

Likewise the getter will construct the typed Entity subclass
instead of returning only the identifiers.

The name of the property is provided by the class under construction,
which will call our __set_name__.

The singular getter will always return the first value set (or None),
while the plural versions of the getter return a generator that yields all
values.

So for instance:

class Dataset(Entity):
author = ContextEntity(Person)

dataset = Dataset()

will have both dataset.author that return Person instance,
and dataset.authors, which return generator of Person instances.

The corresponding plural setter supports any iterable (e.g. list):

person1 = Person("#person1", metadata)
person2 = Person("#person2", metadata)
dataset.creators = [person1, person2]
"""


def add_hash(id_):
if id_ is None or "#" in id_ or is_url(id_):
return id_
Expand All @@ -77,75 +35,3 @@ def __init__(self, crate, identifier=None, properties=None):

def format_id(self, identifier):
return add_hash(identifier)

def getmany(self, instance):
for json in as_list(instance.get(self.property)):
# TODO: Support more advanced dispatching
yield self.entity_constructor(json["@id"], instance._metadata)

# def setmany(self, instance, values):
# json = []
# for value in values:
# # TODO: Check it has compatible @type?
# if value._metadata != instance._metadata:
# # Oh no, it might have different base URIs,
# # will need to be added to @graph, reference
# # other objects we don't have etc.
# # TODO: Support setting entities from other RO-Crates
# raise ValueError(
# "Adding entity from other RO-Crate not (yet) supported"
# )
# json.append({"@id": value.id})
# instance[self.property] = flatten(json)

# def __get__(self, instance, owner=None):
# if instance is None:
# return self
# result = None
# for val in self.getmany(instance):
# if result is not None:
# warnings.warn(
# "More than one value in %s.%s, returning first" %
# (self.owner, self.property)
# )
# break
# result = val
# return result

# def __set__(self, instance, value):
# # TODO: Check if arrays are permitted
# self.setmany(instance, as_list(value))

def __delete__(self, instance):
# TODO: Check if permitted to delete?
instance[self.property] = [] # known property, empty in JSON

def __set_name__(self, owner, name):
if not owner.__doc__:
_set_class_doc(owner)
self.owner = owner
self.property = name
uri = vocabs.term_to_uri(name)
doc = vocabs.schema_doc(uri)
self.__doc__ = "Single contextual entity %s\n%s" % (uri, doc)
# Register plural _s variant
# TODO: Register plural _s variants
setattr(owner, name+"s", property(
self.getmany, # self.setmany,
doc="Multiple contextual entities %s\n%s" % (uri, doc))
)
# TODO: Register _ids variants?


def _set_class_doc(Class):
"""
Set class documentation from schema.org definitions
"""
# set the class documentation
try:
# FIXME: avoid this hack here!
uri = vocabs.term_to_uri(Class.__name__)
doc = vocabs.schema_doc(uri)
Class.__doc__ = "Entity %s\n%s" % (uri, doc)
except KeyError:
pass # Non-matching class name, ignore
4 changes: 0 additions & 4 deletions rocrate/model/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,6 @@ def __eq__(self, other):
def type(self):
return self._jsonld['@type']

# @property
# def types(self)-> List[str]:
# return tuple(as_list(self.get("@type", "Thing")))

@property
def datePublished(self):
d = self.get('datePublished')
Expand Down
104 changes: 15 additions & 89 deletions rocrate/rocrate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# Copyright 2019-2022 The University of Manchester, UK
# Copyright 2020-2022 Vlaams Instituut voor Biotechnologie (VIB), BE
# Copyright 2020-2022 Barcelona Supercomputing Center (BSC), ES
Expand All @@ -19,13 +17,11 @@
# limitations under the License.

import errno
import json
import uuid
import zipfile
import atexit
import shutil
import tempfile
import warnings

from collections import OrderedDict
from pathlib import Path
Expand All @@ -49,23 +45,7 @@
from .model.testsuite import TestSuite

from .utils import is_url, subclasses, get_norm_value, walk


def read_metadata(metadata_path):
"""\
Read an RO-Crate metadata file.

Return a tuple of two elements: the context; a dictionary that maps entity
ids to the entities themselves.
"""
with open(metadata_path) as f:
metadata = json.load(f)
try:
context = metadata['@context']
graph = metadata['@graph']
except KeyError:
raise ValueError(f"{metadata_path} must have a @context and a @graph")
return context, {_["@id"]: _ for _ in graph}
from .metadata import read_metadata, find_root_entity_id


def pick_type(json_entity, type_map, fallback=None):
Expand Down Expand Up @@ -144,71 +124,8 @@ def __read(self, source, gen_preview=False):
self.__read_contextual_entities(entities)
return source

def __check_metadata(self, metadata, entities):
if metadata["@type"] != "CreativeWork":
raise ValueError('metadata descriptor must be of type "CreativeWork"')
try:
root = entities[metadata["about"]["@id"]]
except (KeyError, TypeError):
raise ValueError("metadata descriptor does not reference the root entity")
if ("Dataset" not in root["@type"] if isinstance(root["@type"], list) else root["@type"] != "Dataset"):
raise ValueError('root entity must have "Dataset" among its types')
return metadata["@id"], root["@id"]

def find_root_entity_id(self, entities):
"""\
Find metadata file descriptor and root data entity.

Return a tuple of the corresponding identifiers (metadata, root).
If the entities are not found, raise KeyError. If they are found,
but they don't satisfy the required constraints, raise ValueError.

In the general case, the metadata file descriptor id can be an
absolute URI whose last path segment is "ro-crate-metadata.json[ld]".
Since there can be more than one such id in the crate, we need to
choose among the corresponding (metadata, root) entity pairs. First, we
exclude those that don't satisfy other constraints, such as the
metadata entity being of type CreativeWork, etc.; if this doesn't
leave us with a single pair, we try to pick one with a
heuristic. Suppose we are left with the (m1, r1) and (m2, r2) pairs:
if r1 is the actual root of this crate, then m2 and r2 are regular
files in it, and as such they must appear in r1's hasPart; r2,
however, is not required to have a hasPart property listing other
files. Thus, we look for a pair whose root entity "contains" all
metadata entities from other pairs. If there is no such pair, or there
is more than one, we just return an arbitrary pair.
"""
metadata = entities.get(Metadata.BASENAME, entities.get(LegacyMetadata.BASENAME))
if metadata:
return self.__check_metadata(metadata, entities)
candidates = []
for id_, e in entities.items():
basename = id_.rsplit("/", 1)[-1]
if basename == Metadata.BASENAME or basename == LegacyMetadata.BASENAME:
try:
candidates.append(self.__check_metadata(e, entities))
except ValueError:
pass
if not candidates:
raise KeyError("Metadata file descriptor not found")
elif len(candidates) == 1:
return candidates[0]
else:
warnings.warn("Multiple metadata file descriptors, will pick one with a heuristic")
metadata_ids = set(_[0] for _ in candidates)
for m_id, r_id in candidates:
try:
root = entities[r_id]
part_ids = set(_["@id"] for _ in root["hasPart"])
except KeyError:
continue
if part_ids >= metadata_ids - {m_id}:
# if True for more than one candidate, this pick is arbitrary
return m_id, r_id
return candidates[0] # fall back to arbitrary pick

def __read_data_entities(self, entities, source, gen_preview):
metadata_id, root_id = self.find_root_entity_id(entities)
metadata_id, root_id = find_root_entity_id(entities)
MetadataClass = metadata_class(metadata_id)
metadata_properties = entities.pop(metadata_id)
self.add(MetadataClass(self, metadata_id, properties=metadata_properties))
Expand Down Expand Up @@ -473,10 +390,6 @@ def delete(self, *entities):
pass
self.__entity_map.pop(e.canonical_id(), None)

# TODO
# def fetch_all(self):
# fetch all files defined in the crate

def _copy_unlisted(self, top, base_path):
for root, dirs, files in walk(top, exclude=self.exclude):
root = Path(root)
Expand Down Expand Up @@ -608,3 +521,16 @@ def __validate_suite(self, suite):
if suite is None:
raise ValueError("suite not found")
return suite


def make_workflow_rocrate(workflow_path, wf_type, include_files=None,
                          fetch_remote=False, cwl=None, diagram=None):
    """\
    Create an RO-Crate whose main entity is the given workflow.

    :param workflow_path: path to the workflow file.
    :param wf_type: workflow language (passed through as ``lang``).
    :param include_files: optional iterable of extra files to add to the
      crate (defaults to none).
    :param fetch_remote: whether to fetch remote workflow content.
    :param cwl: when None, an abstract CWL representation is generated.
    :param diagram: unused here; kept for backward compatibility — TODO confirm
      whether callers still rely on this parameter.
    :return: the newly created ROCrate instance.
    """
    wf_crate = ROCrate()
    workflow_path = Path(workflow_path)
    wf_crate.add_workflow(
        workflow_path, workflow_path.name, fetch_remote=fetch_remote,
        main=True, lang=wf_type, gen_cwl=(cwl is None)
    )
    # The original default was a mutable ``[]``, which Python evaluates once
    # and shares across calls; use a None sentinel instead.
    for file_entry in (include_files or []):
        wf_crate.add_file(file_entry)
    return wf_crate
Loading