Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Neat 318 add query module to graph #508

Merged
merged 22 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.PHONY: run-explorer run-tests run-linters build-ui build-python build-docker run-docker compose-up
version="0.81.12"
version="0.82.0"
run-explorer:
@echo "Running explorer API server..."
# open "http://localhost:8000/static/index.html" || true
Expand Down
2 changes: 1 addition & 1 deletion cognite/neat/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.81.12"
__version__ = "0.82.0"
10 changes: 2 additions & 8 deletions cognite/neat/graph/extractors/_mock_graph_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
It is a bit ugly and needs some proper refactoring, but it is not a priority at the moment.
"""

import logging
import random
import warnings
from collections import OrderedDict
Expand Down Expand Up @@ -87,11 +86,11 @@ def generate_triples(
stop_on_exception: bool = False,
allow_isolated_classes: bool = True,
) -> list[Triple]:
"""Generate mock triples based on data model defined transformation rules and desired number
"""Generate mock triples based on data model defined in rules and desired number
of class instances

Args:
transformation_rules : Transformation rules defining the data model
rules : Rules defining the data model
class_count: Target class count for each class in the ontology
stop_on_exception: To stop if exception is encountered or not, default is False
allow_isolated_classes: To allow generation of instances for classes that are not
Expand All @@ -107,11 +106,9 @@ def generate_triples(
if non_existing_classes := set(class_count.keys()) - defined_classes:
msg = f"Class count contains classes {non_existing_classes} for which properties are not defined in Data Model!"
if stop_on_exception:
logging.error(msg)
raise ValueError(msg)
else:
msg += " These classes will be ignored."
logging.warning(msg)
warnings.warn(msg, stacklevel=2)
for class_ in non_existing_classes:
class_count.pop(class_)
Expand Down Expand Up @@ -279,14 +276,12 @@ def _generate_mock_object_property_triples(
if property_definition.value_type not in instance_ids:
msg = f"Class {property_definition.value_type} not found in class count! "
if stop_on_exception:
logging.error(msg)
raise ValueError(msg)
else:
msg += (
f"Skipping creating triples for property {property_definition.name} "
f"of class {class_.suffix} which expects values of this type!"
)
logging.warning(msg)
warnings.warn(msg, stacklevel=2)
return []

Expand Down Expand Up @@ -354,7 +349,6 @@ def _generate_triples_per_class(
)

else:
logging.error(f"Property type {property_.value_type} not supported!")
raise ValueError(f"Property type {property_.value_type} not supported!")

return triples
2 changes: 1 addition & 1 deletion cognite/neat/graph/loaders/_rdf2dms.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def _load(self, stop_on_exception: bool = False) -> Iterable[dm.InstanceApply |
yield from issues
tracker.issue(issues)
class_name = self.class_by_view_id.get(view.as_id(), view.external_id)
triples = self.graph_store.queries.triples_of_type_instances(class_name)
triples = self.graph_store.read(class_name)
for identifier, properties in _triples2dictionary(triples).items():
try:
yield self._create_node(identifier, properties, pydantic_cls, view_id)
Expand Down
3 changes: 3 additions & 0 deletions cognite/neat/graph/queries/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from ._base import Queries

__all__ = ["Queries"]
99 changes: 99 additions & 0 deletions cognite/neat/graph/queries/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import warnings
from typing import cast

from rdflib import RDF, Graph, URIRef
from rdflib.query import ResultRow

from cognite.neat.rules.models.entities import ClassEntity
from cognite.neat.rules.models.information import InformationRules
from cognite.neat.utils.utils import remove_namespace

from ._construct import build_construct_query


class Queries:
    """Helper class for storing standard queries for the graph store.

    Args:
        graph: Graph against which all SPARQL queries are executed.
        rules: Optional information rules. Methods that need the rules
            (``triples_of_type_instances`` and ``construct_instances_of_class``)
            warn and return an empty list when no rules are set.
    """

    def __init__(self, graph: Graph, rules: InformationRules | None = None):
        self.graph = graph
        self.rules = rules

    def list_instances_ids_of_class(self, class_uri: URIRef, limit: int = -1) -> list[URIRef]:
        """Get instances ids for a given class

        Args:
            class_uri: Class for which instances are to be found
            limit: Max number of instances to return, by default -1 meaning all instances.
                Any negative value is treated as "no limit".

        Returns:
            List of class instance URIs
        """
        # SPARQL only accepts a non-negative integer after LIMIT, so a negative
        # value must never reach the query text.
        limit_clause = "" if limit < 0 else f"LIMIT {limit}"
        query_statement = f"SELECT DISTINCT ?subject WHERE {{ ?subject a <{class_uri}> .}} {limit_clause}"

        # Each result row holds a single binding (?subject); unwrap it.
        return [cast(tuple, res)[0] for res in self.graph.query(query_statement)]

    def list_instances_of_type(self, class_uri: URIRef) -> list[ResultRow]:
        """Get all triples for instances of a given class

        Args:
            class_uri: Class for which instances are to be found

        Returns:
            List of result rows (?instance, ?prop, ?value) for instances of the
            given class, ordered by instance
        """
        query = (
            "SELECT ?instance ?prop ?value "
            f"WHERE {{ ?instance rdf:type <{class_uri}> . ?instance ?prop ?value . }} order by ?instance "
        )

        # Select queries gives an iterable of result rows
        return cast(list[ResultRow], list(self.graph.query(query)))

    def triples_of_type_instances(self, rdf_type: str) -> list[tuple[str, str, str]]:
        """Get all triples of a given type.

        This method assumes the graph has been transformed into the default namespace.

        Args:
            rdf_type: Name of the type, resolved against the namespace in the rules metadata

        Returns:
            List of triples (passed through ``remove_namespace``) for instances of the
            given type, excluding the rdf:type triples. Empty list, with a warning,
            when no rules are set.
        """
        if not self.rules:
            warnings.warn("No rules found for the graph store, returning empty list.", stacklevel=2)
            return []

        query = (
            "SELECT ?instance ?prop ?value "
            f"WHERE {{ ?instance a <{self.rules.metadata.namespace[rdf_type]}> . ?instance ?prop ?value . }} "
            "order by ?instance"
        )
        result = self.graph.query(query)

        # We cannot include the RDF.type in case there is a neat:type property
        return [remove_namespace(*triple) for triple in result if triple[1] != RDF.type]  # type: ignore[misc, index]

    def construct_instances_of_class(self, class_: str, properties_optional: bool = True) -> list[tuple[str, str, str]]:
        """CONSTRUCT instances for a given class from the graph store

        Args:
            class_: Class entity for which we want to generate query
            properties_optional: Whether to make all properties optional, default True

        Returns:
            List of triples for instances of the given class. Empty list, with a
            warning, when no rules are set or when no CONSTRUCT query could be
            built for the class.
        """
        if not self.rules:
            warnings.warn("No rules found for the graph store, returning empty list.", stacklevel=2)
            return []

        query = build_construct_query(
            ClassEntity(prefix=self.rules.metadata.prefix, suffix=class_),
            self.graph,
            self.rules,
            properties_optional,
        )
        if not query:
            # Rules exist, but the query builder returned nothing for this class;
            # report that instead of blaming missing rules.
            warnings.warn(
                f"No CONSTRUCT query could be built for class {class_}, returning empty list.",
                stacklevel=2,
            )
            return []

        result = self.graph.query(query)

        # We cannot include the RDF.type in case there is a neat:type property
        return [remove_namespace(*triple) for triple in result if triple[1] != RDF.type]  # type: ignore[misc, index]
185 changes: 185 additions & 0 deletions cognite/neat/graph/queries/_construct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import re
from typing import cast

from rdflib import Graph, URIRef

from cognite.neat.rules.analysis import InformationArchitectRulesAnalysis
from cognite.neat.rules.models._rdfpath import (
AllReferences,
Hop,
RDFPath,
SingleProperty,
Traversal,
)
from cognite.neat.rules.models.entities import ClassEntity
from cognite.neat.rules.models.information import InformationProperty, InformationRules
from cognite.neat.utils.utils import most_occurring_element

from ._shared import Triple, hop2property_path

# str.format template for the CONSTRUCT query. Doubled braces render as the
# literal braces of the SPARQL blocks; the named placeholders are:
#   graph_template - triples of the CONSTRUCT clause
#   graph_pattern  - triples of the WHERE clause
#   filter         - optional FILTER expression (may be an empty string)
_QUERY_TEMPLATE = """CONSTRUCT {{ {graph_template} }}
WHERE {{ {graph_pattern}
{filter}
}}"""


def build_construct_query(
    class_: ClassEntity,
    graph: Graph,
    rules: InformationRules,
    properties_optional: bool = True,
    class_instances: list[URIRef] | None = None,
) -> str | None:
    """Build a CONSTRUCT query for a class from the rules, optionally restricted to given instances.

    Args:
        class_ : The class entity for which the query is generated.
        graph : The graph containing instances of classes.
        rules : The information rules to use for query generation.
        properties_optional : Whether to make all properties optional. Defaults to True.
        class_instances : List of class instances to filter by. Defaults to None (no filter, return all instances).

    Returns:
        The CONSTRUCT query, or None when the rules analysis yields no
        rdfpath-based properties for the class.

    !!! note "On CONSTRUCT Query"
        A CONSTRUCT query is composed of two parts: the graph template and the graph pattern.
        The graph template defines the shape of the instances acquired using the graph pattern.
        This allows us to create a new graph with the new shape without actually modifying
        the original graph, or creating new instances.

        The CONSTRUCT query is far less forgiving than the SELECT query. It will not return
        anything if one of the properties that define the "shape" of the class instance is missing.
        This is the reason why there is an option to make all properties optional, so that
        the query will return all instances that have at least one property defined.
    """
    properties_by_class = InformationArchitectRulesAnalysis(rules).class_property_pairs(
        only_rdfpath=True, consider_inheritance=True
    )
    transformations = properties_by_class.get(class_, None)

    # Nothing defined for this class (missing or empty entry): no query to build.
    if not transformations:
        return None

    templates, patterns = to_construct_triples(
        graph, list(transformations.values()), rules.prefixes, properties_optional
    )

    template_lines = "\n".join(triples2sparql_statement(templates))
    pattern_lines = "\n".join(triples2sparql_statement(patterns))
    filter_clause = add_filter(class_instances) if class_instances else ""

    return _QUERY_TEMPLATE.format(
        graph_template=template_lines,
        graph_pattern=pattern_lines,
        filter=filter_clause,
    )


def add_filter(class_instances: list[URIRef]):
    """Return a SPARQL FILTER clause restricting ``?instance`` to the given instances.

    Args:
        class_instances: Instance identifiers; each is wrapped in angle brackets.

    Returns:
        String of the form ``FILTER (?instance IN (<a>, <b>))``.
    """
    allowed = ", ".join(f"<{instance}>" for instance in class_instances)
    return f"FILTER (?instance IN ({allowed}))"


def to_construct_triples(
    graph: Graph, transformations: list[InformationProperty], prefixes: dict, properties_optional: bool = True
) -> tuple[list[Triple], list[Triple]]:
    """Converts transformations of a class to CONSTRUCT triples which are used to generate CONSTRUCT query

    Args:
        graph: Graph containing instances of classes (used for property inference for hops)
        transformations : List of transformations to use to form triples
        prefixes : Dictionary of prefixes for namespaces
        properties_optional : Flag indicating if properties should be optional. Defaults to True.

    Returns:
        tuple: Tuple of triples that define graph template and graph pattern parts of CONSTRUCT query.
        Both lists are empty when no transformations are given.


    !!! note "Purely inherited transformations"
        Assumption that neat makes is that in case of purely inherited transformations
        we will type instance with class to which transformation belongs to.

        Otherwise we will type instance with class that is most occurring in non-inherited
        transformations.

    """
    # TODO: Add handling of UNIONs in rules

    # Without transformations there is no class to type the instance with,
    # so there is nothing to build.
    if not transformations:
        return [], []

    templates: list[Triple] = []
    patterns: list[Triple] = []
    non_inherited_starting_rdf_types = []

    for transformation in transformations:
        traversal = cast(RDFPath, transformation.transformation).traversal

        # keeping track of starting rdf types of non-inherited transformations/properties
        if isinstance(traversal, Traversal) and not transformation.inherited:
            non_inherited_starting_rdf_types.append(traversal.class_.id)

        # SPARQL variable names may only contain letters, digits and underscore,
        # every other character of the property name is replaced by underscore
        variable = re.sub(r"[^a-zA-Z0-9_]", "_", str(transformation.property_).lower())

        graph_template_triple = Triple(
            subject="?instance",
            predicate=f"{transformation.class_.prefix}:{transformation.property_}",
            object=f"?{variable}",
            optional=False,
        )
        templates.append(graph_template_triple)

        # use case AllReferences: binding instance to certain rdf property
        if isinstance(traversal, AllReferences):
            graph_pattern_triple = Triple(
                subject="BIND(?instance", predicate="AS", object=f"{graph_template_triple.object})", optional=False
            )

        # use case SingleProperty: simple property traversal
        elif isinstance(traversal, SingleProperty):
            graph_pattern_triple = Triple(
                subject=graph_template_triple.subject,
                predicate=traversal.property.id,
                object=graph_template_triple.object,
                optional=True if properties_optional else not transformation.is_mandatory,
            )

        # use case Hop: property traversal with multiple hops turned into property path
        # see: https://www.oxfordsemantic.tech/faqs/what-is-a-property-path
        elif isinstance(traversal, Hop):
            graph_pattern_triple = Triple(
                subject="?instance",
                predicate=hop2property_path(graph, traversal, prefixes),
                object=graph_template_triple.object,
                optional=True if properties_optional else not transformation.is_mandatory,
            )

        # other type of rdfpaths are skipped
        else:
            continue

        patterns.append(graph_pattern_triple)

    # Add first triple for graph pattern stating type of object
    # we use most occurring here to pull out most occurring rdf type of the starting
    # node of the transformation, or the class itself to which the transformation is
    # defined for (taken from the last transformation in the list).
    # This is safeguard in case there are multiple classes in the graph pattern
    if non_inherited_starting_rdf_types:
        instance_type = most_occurring_element(non_inherited_starting_rdf_types)
    else:
        instance_type = str(transformations[-1].class_)

    patterns.insert(
        0,
        Triple(subject="?instance", predicate="a", object=instance_type, optional=False),
    )

    return templates, patterns


def triples2sparql_statement(triples: list[Triple]):
    """Render each triple as one SPARQL statement.

    Args:
        triples: Triples exposing ``subject``, ``predicate``, ``object`` and ``optional``.

    Returns:
        List of statements, one per triple and in the same order; triples flagged
        as optional are wrapped in an ``OPTIONAL { ... }`` block.
    """
    statements = []
    for triple in triples:
        statement = f"{triple.subject} {triple.predicate} {triple.object} ."
        if triple.optional:
            statement = f"OPTIONAL {{ {statement} }}"
        statements.append(statement)
    return statements
Loading
Loading