Re-applied fixes from testing datadoc on a use case for PINK. (#285)

# Description Re-applied fixes from testing datadoc on a use case for PINK. --------- Co-authored-by: Francesca L. Bleken <48128015+francescalb@users.noreply.github.com>
EMMC-ASBL · Jan 8, 2025 · 624351a · 624351a
1 parent 33f1495
commit 624351a
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 30 deletions.
diff --git a/docs/tools/datadoc.md b/docs/tools/datadoc.md
@@ -48,7 +48,7 @@ Currently, `datadoc` has currently three sub-commands, `add`, `find` and `load`
 
 * The `--backend`, `--base-iri`, `--database` and `--package` options are all for connecting to a triplestore.
 
-* The `--parse`, `--parse-format` and `--prefixes` options are for pre-loading the triplestore with with triples from an external source, like a ntriples or turtle file, and for adding namespace prefixes.
+* The `--parse`, `--parse-format` and `--prefixes` options are for pre-loading the triplestore with triples from an external source, like an ntriples or turtle file, and for adding namespace prefixes.
 They are typically used with the default "rdflib" in-memory backend.
 
 
@@ -282,10 +282,12 @@ options:
   -h, --help            show this help message and exit
   --type TYPE, -t TYPE  Either a resource type (ex: "dataset", "distribution",
                         ...) or the IRI of a class to limit the search to.
-  --criteria KEYWORD=VALUE [KEYWORD=VALUE ...], -c KEYWORD=VALUE [KEYWORD=VALUE ...]
-                        One of more additional matching criteria for resources
-                        to find. Only resources with the given KEYWORD and
-                        VALUE will be matched. The match is exact.
+  --criteria IRI=VALUE, -c IRI=VALUE
+                        Matching criteria for resources to find. The IRI may
+                        be written using a namespace prefix, like
+                        `dcterms:title="My title"`. Currently only exact
+                        matching is supported. This option can be given
+                        multiple times.
   --output FILENAME, -o FILENAME
                         Write matching output to the given file. The default
                         is to write to standard output.
@@ -296,11 +298,11 @@ options:
 ```
 
 The `--type` and `--criteria` options provide search criteria.
-The `--type` option an be any of the recognised [resource types] to limit the search to.
+The `--type` option can be any of the recognised [resource types] to limit the search to.
 Alternatively, it may be the IRI of a class.
 This limits the search to only resources that are individuals of this class.
 
-The `--output` options allows to write the matching output to file instead of standard output.
+The `--output` option allows writing the matching output to a file instead of standard output.
 
 The `--format` option controls how the search result should be presented.
 The following formats are currently available:
@@ -337,7 +339,7 @@ The following formats are currently available:
     **Ex 3**: List IRIs of all resources with a given title:
 
     ```shell
-    $ datadoc --parse=kb.ttl find --criteria title="Series of SEM image of cement sample 77600"
+    $ datadoc --parse=kb.ttl find --criteria dcterms:title="Series of SEM image of cement sample 77600"
     https://he-matchmaker.eu/data/sem/SEM_cement_batch2/77600-23-001
     ```
 

diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py
@@ -271,7 +271,9 @@ def test_datadoc():
         SEMDATA["SEM_cement_batch2"],
     }
     assert not named_datasets.difference(datasets)
-    assert set(search_iris(ts, creator="Sigurd Wenner")) == {
+    assert set(
+        search_iris(ts, criterias={"dcterms:creator": "Sigurd Wenner"})
+    ) == {
         SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
         SEMDATA["SEM_cement_batch2/77600-23-001"],
         SEMDATA["SEM_cement_batch2"],

diff --git a/tripper/dataset/datadoc.py b/tripper/dataset/datadoc.py
@@ -48,10 +48,10 @@ def subcommand_add(ts, args):
 def subcommand_find(ts, args):
     """Subcommand for finding IRIs in the triplestore."""
     if args.criteria:
-        kwargs = dict(crit.split("=", 1) for crit in args.criteria)
+        criterias = dict(crit.split("=", 1) for crit in args.criteria)
     else:
-        kwargs = {}
-    iris = search_iris(ts, type=args.type, **kwargs)
+        criterias = {}
+    iris = search_iris(ts, type=args.type, criterias=criterias)
 
     # Infer format
     if args.format:
@@ -176,13 +176,13 @@ def main(argv=None):
     parser_find.add_argument(
         "--criteria",
         "-c",
-        action="extend",
-        nargs="+",
-        metavar="KEYWORD=VALUE",
+        action="append",
+        metavar="IRI=VALUE",
         help=(
-            "One of more additional matching criteria for resources to find. "
-            "Only resources with the given KEYWORD and VALUE will be matched. "
-            "The match is exact."
+            "Matching criteria for resources to find. The IRI may be written "
+            'using a namespace prefix, like `tcterms:title="My title"`. '
+            "Currently only exact matching is supported. "
+            "This option can be given multiple times."
         ),
     )
     parser_find.add_argument(
@@ -264,10 +264,12 @@ def main(argv=None):
     parser.add_argument(
         "--prefixes",
         "-P",
-        action="extend",
-        nargs="+",
+        action="append",
         metavar="PREFIX=URL",
-        help="Namespace prefixes to bind to the triplestore.",
+        help=(
+            "Namespace prefixes to bind to the triplestore. "
+            "This option can be given multiple times."
+        ),
     )
 
     args = parser.parse_args(argv)

diff --git a/tripper/dataset/dataset.py b/tripper/dataset/dataset.py
@@ -853,15 +853,21 @@ def get_partial_pipeline(
     return pipeline
 
 
-def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
+def search_iris(
+    ts: Triplestore, type=None, criterias: "Optional[dict]" = None
+) -> "List[str]":
     """Return a list of IRIs for all matching resources.
     Additional matching criteria can be specified with `criterias`.
 
     Arguments:
         ts: Triplestore to search.
         type: Either a [resource type] (ex: "dataset", "distribution", ...)
             or the IRI of a class to limit the search to.
-        kwargs: Match criterias.
+        criterias: Match criterias. A dict of IRI, value pairs, where the
+            IRIs refer to data properties on the resource match. The IRIs
+            may use any prefix defined in `ts`. E.g. if the prefix `dcterms`
+            is in `ts`, it is expanded and the match criteria `dcterms:title`
+            is correctly parsed.
 
     Returns:
         List of IRIs for matching resources.
@@ -889,8 +895,15 @@ def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
     SeeAlso:
     [resource type]: https://emmc-asbl.github.io/tripper/latest/dataset/introduction/#resource-types
     """
+    if criterias is None:
+        criterias = {}
+
     # Special handling of @id
-    id = kwargs.pop("@id") if "@id" in kwargs else kwargs.pop("_id", None)
+    id = (
+        criterias.pop("@id")
+        if "@id" in criterias
+        else criterias.pop("_id", None)
+    )
 
     crit = []
     if type:
@@ -913,11 +926,10 @@ def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
         crit.append("  ?iri rdf:type ?o .")
 
     expanded = {v: k for k, v in get_shortnames().items()}
-    for k, v in kwargs.items():
+    for k, v in criterias.items():
         key = f"@{k[1:]}" if k.startswith("_") else k
-        if key not in expanded:
-            raise InvalidKeywordError(key)
-        predicate = expanded[key]
+        predicate = ts.expand_iri(key)
+
         if v in expanded:
             value = f"<{expanded[v]}>"
         elif isinstance(v, str):
@@ -927,12 +939,12 @@ def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
         else:
             value = v
         crit.append(f"      ?iri <{predicate}> {value} .")
-    criterias = "\n".join(crit)
+    where_statements = "\n".join(crit)
     query = f"""
     PREFIX rdf: <{RDF}>
     SELECT DISTINCT ?iri
     WHERE {{
-    {criterias}
+    {where_statements}
     }}
     """
     return [r[0] for r in ts.query(query) if not id or r[0] == id]  # type: ignore