Skip to content

Commit

Permalink
Re-applied fixes from testing datadoc on a use case for PINK. (#285)
Browse files Browse the repository at this point in the history
# Description
Re-applied fixes from testing datadoc on a use case for PINK.

---------

Co-authored-by: Francesca L. Bleken <48128015+francescalb@users.noreply.github.com>
  • Loading branch information
jesper-friis and francescalb authored Jan 8, 2025
1 parent 33f1495 commit 624351a
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 30 deletions.
18 changes: 10 additions & 8 deletions docs/tools/datadoc.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Currently, `datadoc` has three sub-commands, `add`, `find` and `load`

* The `--backend`, `--base-iri`, `--database` and `--package` options are all for connecting to a triplestore.

* The `--parse`, `--parse-format` and `--prefixes` options are for pre-loading the triplestore with with triples from an external source, like a ntriples or turtle file, and for adding namespace prefixes.
* The `--parse`, `--parse-format` and `--prefixes` options are for pre-loading the triplestore with triples from an external source, like an N-Triples or Turtle file, and for adding namespace prefixes.
They are typically used with the default "rdflib" in-memory backend.


Expand Down Expand Up @@ -282,10 +282,12 @@ options:
-h, --help show this help message and exit
--type TYPE, -t TYPE Either a resource type (ex: "dataset", "distribution",
...) or the IRI of a class to limit the search to.
--criteria KEYWORD=VALUE [KEYWORD=VALUE ...], -c KEYWORD=VALUE [KEYWORD=VALUE ...]
One of more additional matching criteria for resources
to find. Only resources with the given KEYWORD and
VALUE will be matched. The match is exact.
--criteria IRI=VALUE, -c IRI=VALUE
Matching criteria for resources to find. The IRI may
be written using a namespace prefix, like
`dcterms:title="My title"`. Currently only exact
matching is supported. This option can be given
multiple times.
--output FILENAME, -o FILENAME
Write matching output to the given file. The default
is to write to standard output.
Expand All @@ -296,11 +298,11 @@ options:
```

The `--type` and `--criteria` options provide search criteria.
The `--type` option an be any of the recognised [resource types] to limit the search to.
The `--type` option can be any of the recognised [resource types] to limit the search to.
Alternatively, it may be the IRI of a class.
This limits the search to only resources that are individuals of this class.

The `--output` options allows to write the matching output to file instead of standard output.
The `--output` option allows writing the matching output to a file instead of standard output.

The `--format` option controls how the search result should be presented.
The following formats are currently available:
Expand Down Expand Up @@ -337,7 +339,7 @@ The following formats are currently available:
**Ex 3**: List IRIs of all resources with a given title:

```shell
$ datadoc --parse=kb.ttl find --criteria title="Series of SEM image of cement sample 77600"
$ datadoc --parse=kb.ttl find --criteria dcterms:title="Series of SEM image of cement sample 77600"
https://he-matchmaker.eu/data/sem/SEM_cement_batch2/77600-23-001
```

Expand Down
4 changes: 3 additions & 1 deletion tests/dataset/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,9 @@ def test_datadoc():
SEMDATA["SEM_cement_batch2"],
}
assert not named_datasets.difference(datasets)
assert set(search_iris(ts, creator="Sigurd Wenner")) == {
assert set(
search_iris(ts, criterias={"dcterms:creator": "Sigurd Wenner"})
) == {
SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"],
SEMDATA["SEM_cement_batch2/77600-23-001"],
SEMDATA["SEM_cement_batch2"],
Expand Down
26 changes: 14 additions & 12 deletions tripper/dataset/datadoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ def subcommand_add(ts, args):
def subcommand_find(ts, args):
"""Subcommand for finding IRIs in the triplestore."""
if args.criteria:
kwargs = dict(crit.split("=", 1) for crit in args.criteria)
criterias = dict(crit.split("=", 1) for crit in args.criteria)
else:
kwargs = {}
iris = search_iris(ts, type=args.type, **kwargs)
criterias = {}
iris = search_iris(ts, type=args.type, criterias=criterias)

# Infer format
if args.format:
Expand Down Expand Up @@ -176,13 +176,13 @@ def main(argv=None):
parser_find.add_argument(
"--criteria",
"-c",
action="extend",
nargs="+",
metavar="KEYWORD=VALUE",
action="append",
metavar="IRI=VALUE",
help=(
"One of more additional matching criteria for resources to find. "
"Only resources with the given KEYWORD and VALUE will be matched. "
"The match is exact."
"Matching criteria for resources to find. The IRI may be written "
'using a namespace prefix, like `tcterms:title="My title"`. '
"Currently only exact matching is supported. "
"This option can be given multiple times."
),
)
parser_find.add_argument(
Expand Down Expand Up @@ -264,10 +264,12 @@ def main(argv=None):
parser.add_argument(
"--prefixes",
"-P",
action="extend",
nargs="+",
action="append",
metavar="PREFIX=URL",
help="Namespace prefixes to bind to the triplestore.",
help=(
"Namespace prefixes to bind to the triplestore. "
"This option can be given multiple times."
),
)

args = parser.parse_args(argv)
Expand Down
30 changes: 21 additions & 9 deletions tripper/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,15 +853,21 @@ def get_partial_pipeline(
return pipeline


def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
def search_iris(
ts: Triplestore, type=None, criterias: "Optional[dict]" = None
) -> "List[str]":
"""Return a list of IRIs for all matching resources.
Additional matching criteria can be specified with `criterias`.
Arguments:
ts: Triplestore to search.
type: Either a [resource type] (ex: "dataset", "distribution", ...)
or the IRI of a class to limit the search to.
kwargs: Match criterias.
criterias: Match criterias. A dict of IRI, value pairs, where the
IRIs refer to data properties on the resource match. The IRIs
may use any prefix defined in `ts`. E.g. if the prefix `dcterms`
is in `ts`, it is expanded and the match criteria `dcterms:title`
is correctly parsed.
Returns:
List of IRIs for matching resources.
Expand Down Expand Up @@ -889,8 +895,15 @@ def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
SeeAlso:
[resource type]: https://emmc-asbl.github.io/tripper/latest/dataset/introduction/#resource-types
"""
if criterias is None:
criterias = {}

# Special handling of @id
id = kwargs.pop("@id") if "@id" in kwargs else kwargs.pop("_id", None)
id = (
criterias.pop("@id")
if "@id" in criterias
else criterias.pop("_id", None)
)

crit = []
if type:
Expand All @@ -913,11 +926,10 @@ def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
crit.append(" ?iri rdf:type ?o .")

expanded = {v: k for k, v in get_shortnames().items()}
for k, v in kwargs.items():
for k, v in criterias.items():
key = f"@{k[1:]}" if k.startswith("_") else k
if key not in expanded:
raise InvalidKeywordError(key)
predicate = expanded[key]
predicate = ts.expand_iri(key)

if v in expanded:
value = f"<{expanded[v]}>"
elif isinstance(v, str):
Expand All @@ -927,12 +939,12 @@ def search_iris(ts: Triplestore, type=None, **kwargs) -> "List[str]":
else:
value = v
crit.append(f" ?iri <{predicate}> {value} .")
criterias = "\n".join(crit)
where_statements = "\n".join(crit)
query = f"""
PREFIX rdf: <{RDF}>
SELECT DISTINCT ?iri
WHERE {{
{criterias}
{where_statements}
}}
"""
return [r[0] for r in ts.query(query) if not id or r[0] == id] # type: ignore

0 comments on commit 624351a

Please sign in to comment.