Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup #129

Merged
merged 3 commits into from
May 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions rocrate/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2019-2022 The University of Manchester, UK
# Copyright 2020-2022 Vlaams Instituut voor Biotechnologie (VIB), BE
# Copyright 2020-2022 Barcelona Supercomputing Center (BSC), ES
# Copyright 2020-2022 Center for Advanced Studies, Research and Development in Sardinia (CRS4), IT
# Copyright 2022 École Polytechnique Fédérale de Lausanne, CH
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import warnings

from .model.metadata import Metadata, LegacyMetadata


def read_metadata(metadata_path):
    """\
    Read an RO-Crate metadata file.

    :param metadata_path: path to the crate's metadata file
      (``ro-crate-metadata.json`` or the legacy ``ro-crate-metadata.jsonld``).
    :return: a tuple of two elements: the context; a dictionary that maps
      entity ids to the entities themselves.
    :raises ValueError: if the file does not contain both a ``@context``
      and a ``@graph`` key.
    """
    # JSON-LD is UTF-8 by specification: decode explicitly rather than with
    # the platform default encoding (which may be e.g. cp1252 on Windows).
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)
    try:
        context = metadata['@context']
        graph = metadata['@graph']
    except KeyError:
        raise ValueError(f"{metadata_path} must have a @context and a @graph")
    return context, {_["@id"]: _ for _ in graph}


def _check_descriptor(descriptor, entities):
if descriptor["@type"] != "CreativeWork":
raise ValueError('metadata descriptor must be of type "CreativeWork"')
try:
root = entities[descriptor["about"]["@id"]]
except (KeyError, TypeError):
raise ValueError("metadata descriptor does not reference the root entity")
if ("Dataset" not in root["@type"] if isinstance(root["@type"], list) else root["@type"] != "Dataset"):
raise ValueError('root entity must have "Dataset" among its types')
return descriptor["@id"], root["@id"]


def find_root_entity_id(entities):
    """\
    Locate the metadata file descriptor and the root data entity.

    *entities* is a dictionary that maps JSON entity IDs to the entities
    themselves (like the second element returned by read_metadata).

    Return a tuple of the corresponding identifiers (descriptor, root).
    Raise KeyError when no descriptor is found; raise ValueError when one
    is found but does not satisfy the required constraints.

    In general the descriptor id can be an absolute URI whose last path
    segment is "ro-crate-metadata.json[ld]", and a crate may contain more
    than one such id, so we may have to choose among several (descriptor,
    root) pairs. Pairs failing the basic constraints (descriptor of type
    CreativeWork, etc.) are discarded first; if more than one remains, a
    heuristic decides: given surviving pairs (m1, r1) and (m2, r2), if r1
    is the actual root of this crate then m2 and r2 are ordinary files in
    it and must appear in r1's hasPart, whereas r2 is not required to
    list anything. We therefore prefer a pair whose root "contains" the
    descriptors of every other pair; when no pair (or more than one pair)
    qualifies, an arbitrary one is returned.
    """
    descriptor = entities.get(Metadata.BASENAME, entities.get(LegacyMetadata.BASENAME))
    if descriptor:
        return _check_descriptor(descriptor, entities)
    known_names = (Metadata.BASENAME, LegacyMetadata.BASENAME)
    candidates = []
    for entity_id, entity in entities.items():
        if entity_id.rsplit("/", 1)[-1] in known_names:
            try:
                candidates.append(_check_descriptor(entity, entities))
            except ValueError:
                pass
    if not candidates:
        raise KeyError("Metadata file descriptor not found")
    if len(candidates) == 1:
        return candidates[0]
    warnings.warn("Multiple metadata file descriptors, will pick one with a heuristic")
    descriptor_ids = {pair[0] for pair in candidates}
    for desc_id, root_id in candidates:
        try:
            contained = {part["@id"] for part in entities[root_id]["hasPart"]}
        except KeyError:
            continue
        if contained >= descriptor_ids - {desc_id}:
            # more than one candidate may pass this check; the first wins
            return desc_id, root_id
    return candidates[0]  # no pair stood out: fall back to an arbitrary pick
116 changes: 1 addition & 115 deletions rocrate/model/contextentity.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,52 +18,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .. import vocabs
from ..utils import as_list, is_url

from ..utils import is_url
from .entity import Entity


"""
A property class that can be used during class declaration
to make getter/setter properties.

The class name under construction is assumed to be a valid class name
in schema.org as referenced from the RO-Crate JSON-LD context,
and likewise the class properties defined using this
are assumed to be valid schema.org properties.

The setters handle any Entity by picking up their @id instead
of nesting their objects.

Likewise the getter will construct the typed Entity subclass
instead of returning only the identifiers.

The name of the property is provided by the class under construction,
which will call our __set_name__.

The singular getter will always return the first value set (or None),
while the plural versions of the getter return a generator that yields all
values.

So for instance:

class Dataset(Entity):
author = ContextEntity(Person)

dataset = Dataset()

will have both dataset.author that return Person instance,
and dataset.authors, which return generator of Person instances.

The corresponding plural setter supports any iterable (e.g. list):

person1 = Person("#person1", metadata)
person2 = Person("#person2", metadata)
dataset.creators = [person1, person2]
"""


def add_hash(id_):
if id_ is None or "#" in id_ or is_url(id_):
return id_
Expand All @@ -77,75 +35,3 @@ def __init__(self, crate, identifier=None, properties=None):

def format_id(self, identifier):
return add_hash(identifier)

def getmany(self, instance):
for json in as_list(instance.get(self.property)):
# TODO: Support more advanced dispatching
yield self.entity_constructor(json["@id"], instance._metadata)

# def setmany(self, instance, values):
# json = []
# for value in values:
# # TODO: Check it has compatible @type?
# if value._metadata != instance._metadata:
# # Oh no, it might have different base URIs,
# # will need to be added to @graph, reference
# # other objects we don't have etc.
# # TODO: Support setting entities from other RO-Crates
# raise ValueError(
# "Adding entity from other RO-Crate not (yet) supported"
# )
# json.append({"@id": value.id})
# instance[self.property] = flatten(json)

# def __get__(self, instance, owner=None):
# if instance is None:
# return self
# result = None
# for val in self.getmany(instance):
# if result is not None:
# warnings.warn(
# "More than one value in %s.%s, returning first" %
# (self.owner, self.property)
# )
# break
# result = val
# return result

# def __set__(self, instance, value):
# # TODO: Check if arrays are permitted
# self.setmany(instance, as_list(value))

def __delete__(self, instance):
# TODO: Check if permitted to delete?
instance[self.property] = [] # known property, empty in JSON

def __set_name__(self, owner, name):
if not owner.__doc__:
_set_class_doc(owner)
self.owner = owner
self.property = name
uri = vocabs.term_to_uri(name)
doc = vocabs.schema_doc(uri)
self.__doc__ = "Single contextual entity %s\n%s" % (uri, doc)
# Register plural _s variant
# TODO: Register plural _s variants
setattr(owner, name+"s", property(
self.getmany, # self.setmany,
doc="Multiple contextual entities %s\n%s" % (uri, doc))
)
# TODO: Register _ids variants?


def _set_class_doc(Class):
"""
Set class documentation from schema.org definitions
"""
# set the class documentation
try:
# FIXME: avoid this hack here!
uri = vocabs.term_to_uri(Class.__name__)
doc = vocabs.schema_doc(uri)
Class.__doc__ = "Entity %s\n%s" % (uri, doc)
except KeyError:
pass # Non-matching class name, ignore
4 changes: 0 additions & 4 deletions rocrate/model/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,6 @@ def __eq__(self, other):
def type(self):
return self._jsonld['@type']

# @property
# def types(self)-> List[str]:
# return tuple(as_list(self.get("@type", "Thing")))

@property
def datePublished(self):
d = self.get('datePublished')
Expand Down
104 changes: 15 additions & 89 deletions rocrate/rocrate.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/usr/bin/env python

# Copyright 2019-2022 The University of Manchester, UK
# Copyright 2020-2022 Vlaams Instituut voor Biotechnologie (VIB), BE
# Copyright 2020-2022 Barcelona Supercomputing Center (BSC), ES
Expand All @@ -19,13 +17,11 @@
# limitations under the License.

import errno
import json
import uuid
import zipfile
import atexit
import shutil
import tempfile
import warnings

from collections import OrderedDict
from pathlib import Path
Expand All @@ -49,23 +45,7 @@
from .model.testsuite import TestSuite

from .utils import is_url, subclasses, get_norm_value, walk


def read_metadata(metadata_path):
"""\
Read an RO-Crate metadata file.

Return a tuple of two elements: the context; a dictionary that maps entity
ids to the entities themselves.
"""
with open(metadata_path) as f:
metadata = json.load(f)
try:
context = metadata['@context']
graph = metadata['@graph']
except KeyError:
raise ValueError(f"{metadata_path} must have a @context and a @graph")
return context, {_["@id"]: _ for _ in graph}
from .metadata import read_metadata, find_root_entity_id


def pick_type(json_entity, type_map, fallback=None):
Expand Down Expand Up @@ -144,71 +124,8 @@ def __read(self, source, gen_preview=False):
self.__read_contextual_entities(entities)
return source

def __check_metadata(self, metadata, entities):
if metadata["@type"] != "CreativeWork":
raise ValueError('metadata descriptor must be of type "CreativeWork"')
try:
root = entities[metadata["about"]["@id"]]
except (KeyError, TypeError):
raise ValueError("metadata descriptor does not reference the root entity")
if ("Dataset" not in root["@type"] if isinstance(root["@type"], list) else root["@type"] != "Dataset"):
raise ValueError('root entity must have "Dataset" among its types')
return metadata["@id"], root["@id"]

def find_root_entity_id(self, entities):
"""\
Find metadata file descriptor and root data entity.

Return a tuple of the corresponding identifiers (metadata, root).
If the entities are not found, raise KeyError. If they are found,
but they don't satisfy the required constraints, raise ValueError.

In the general case, the metadata file descriptor id can be an
absolute URI whose last path segment is "ro-crate-metadata.json[ld]".
Since there can be more than one such id in the crate, we need to
choose among the corresponding (metadata, root) entity pairs. First, we
exclude those that don't satisfy other constraints, such as the
metadata entity being of type CreativeWork, etc.; if this doesn't
leave us with a single pair, we try to pick one with a
heuristic. Suppose we are left with the (m1, r1) and (m2, r2) pairs:
if r1 is the actual root of this crate, then m2 and r2 are regular
files in it, and as such they must appear in r1's hasPart; r2,
however, is not required to have a hasPart property listing other
files. Thus, we look for a pair whose root entity "contains" all
metadata entities from other pairs. If there is no such pair, or there
is more than one, we just return an arbitrary pair.
"""
metadata = entities.get(Metadata.BASENAME, entities.get(LegacyMetadata.BASENAME))
if metadata:
return self.__check_metadata(metadata, entities)
candidates = []
for id_, e in entities.items():
basename = id_.rsplit("/", 1)[-1]
if basename == Metadata.BASENAME or basename == LegacyMetadata.BASENAME:
try:
candidates.append(self.__check_metadata(e, entities))
except ValueError:
pass
if not candidates:
raise KeyError("Metadata file descriptor not found")
elif len(candidates) == 1:
return candidates[0]
else:
warnings.warn("Multiple metadata file descriptors, will pick one with a heuristic")
metadata_ids = set(_[0] for _ in candidates)
for m_id, r_id in candidates:
try:
root = entities[r_id]
part_ids = set(_["@id"] for _ in root["hasPart"])
except KeyError:
continue
if part_ids >= metadata_ids - {m_id}:
# if True for more than one candidate, this pick is arbitrary
return m_id, r_id
return candidates[0] # fall back to arbitrary pick

def __read_data_entities(self, entities, source, gen_preview):
metadata_id, root_id = self.find_root_entity_id(entities)
metadata_id, root_id = find_root_entity_id(entities)
MetadataClass = metadata_class(metadata_id)
metadata_properties = entities.pop(metadata_id)
self.add(MetadataClass(self, metadata_id, properties=metadata_properties))
Expand Down Expand Up @@ -473,10 +390,6 @@ def delete(self, *entities):
pass
self.__entity_map.pop(e.canonical_id(), None)

# TODO
# def fetch_all(self):
# fetch all files defined in the crate

def _copy_unlisted(self, top, base_path):
for root, dirs, files in walk(top, exclude=self.exclude):
root = Path(root)
Expand Down Expand Up @@ -608,3 +521,16 @@ def __validate_suite(self, suite):
if suite is None:
raise ValueError("suite not found")
return suite


def make_workflow_rocrate(workflow_path, wf_type, include_files=None,
                          fetch_remote=False, cwl=None, diagram=None):
    """\
    Create an RO-Crate whose main entity is the given workflow.

    :param workflow_path: path to the workflow file.
    :param wf_type: workflow language (passed through as ``lang``).
    :param include_files: optional iterable of extra files to add to the
      crate (defaults to none).
    :param fetch_remote: whether to fetch remote workflow content.
    :param cwl: when None, an abstract CWL representation is generated.
    :param diagram: unused here; kept for backward compatibility — TODO confirm
      whether callers still rely on this parameter.
    :return: the newly created ROCrate instance.
    """
    wf_crate = ROCrate()
    workflow_path = Path(workflow_path)
    wf_crate.add_workflow(
        workflow_path, workflow_path.name, fetch_remote=fetch_remote,
        main=True, lang=wf_type, gen_cwl=(cwl is None)
    )
    # The original default was a mutable ``[]``, which Python evaluates once
    # and shares across calls; use a None sentinel instead.
    for file_entry in (include_files or []):
        wf_crate.add_file(file_entry)
    return wf_crate
Loading