From 0b72047c92b4fb5c60a0084ed1182bae711d4201 Mon Sep 17 00:00:00 2001 From: Binh Vu Date: Sun, 24 Mar 2024 21:11:34 +0000 Subject: [PATCH] fix missing properties domains/ranges (drop support for python 3.9) --- CHANGELOG.md | 7 ++- kgdata/dbpedia/datasets/properties.py | 8 +-- kgdata/models/ont_class.py | 13 +--- kgdata/models/ont_property.py | 42 ++++++------- kgdata/wikidata/models/wdentity.py | 13 +--- kgdata/wikidata/models/wdproperty.py | 86 ++++++++++++++++++++++++--- pyproject.toml | 4 +- scripts/build.sh | 40 ++++++------- 8 files changed, 131 insertions(+), 82 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 498a4be..86c4dd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # CHANGE LOG -## [Unreleased] +## [7.0.0] (2024-03-24) ### Added @@ -11,6 +11,11 @@ ### Changed - Reuse code: `GenericDB.get_default_props` now calls `ont_property.get_default_props`. +- Drop support for Python 3.9 to use new features in dataclass + +### Fixed + +- Fix domains/ranges of ontology properties ## [6.5.2] (2024-03-08) diff --git a/kgdata/dbpedia/datasets/properties.py b/kgdata/dbpedia/datasets/properties.py index fab731c..92e9578 100644 --- a/kgdata/dbpedia/datasets/properties.py +++ b/kgdata/dbpedia/datasets/properties.py @@ -4,9 +4,6 @@ from functools import partial from urllib.parse import urlparse -from rdflib import OWL, RDF, RDFS, BNode, Literal, URIRef -from sm.misc.funcs import assert_not_null - from kgdata.dataset import Dataset from kgdata.db import deser_from_dict, ser_to_dict from kgdata.dbpedia.config import DBpediaDirCfg @@ -15,6 +12,8 @@ from kgdata.models.multilingual import MultiLingualString, MultiLingualStringList from kgdata.models.ont_property import OntologyProperty from kgdata.splitter import split_a_list +from rdflib import OWL, RDF, RDFS, BNode, Literal, URIRef +from sm.misc.funcs import assert_not_null rdf_type = str(RDF.type) rdfs_label = str(RDFS.label) @@ -76,7 +75,8 @@ def to_prop(resource: RDFResource, default_lang: str = "en") -> OntologyProperty equivalent_properties=[ str(term) for term in resource.props.get(str(OWL.equivalentProperty), []) ], - subjects=[str(term) for term in resource.props.get(str(RDFS.domain), [])], + domains=[str(term) for term in resource.props.get(str(RDFS.domain), [])], + ranges=[str(term) for term in resource.props.get(str(RDFS.range), [])], inverse_properties=[], instanceof=[str(term) for term in resource.props.get(rdf_type, [])], ancestors={}, diff --git a/kgdata/models/ont_class.py b/kgdata/models/ont_class.py index be95d5e..566030b 100644 --- a/kgdata/models/ont_class.py +++ b/kgdata/models/ont_class.py @@ -7,19 +7,8 @@ from rdflib import OWL, RDFS -@dataclass +@dataclass(kw_only=True, slots=True) class OntologyClass: - __slots__ = ( - "id", - "label", - "description", - "aliases", - "parents", - "properties", - "different_froms", - "equivalent_classes", - "ancestors", - ) id: str label: MultiLingualString description: MultiLingualString diff --git a/kgdata/models/ont_property.py b/kgdata/models/ont_property.py index 7134c36..44d6de4 100644 --- a/kgdata/models/ont_property.py +++ b/kgdata/models/ont_property.py @@ -1,40 +1,30 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Mapping +from typing import Mapping, Optional from kgdata.models.multilingual import MultiLingualString, MultiLingualStringList from rdflib import RDF, RDFS, XSD -@dataclass +@dataclass(kw_only=True, slots=True) class OntologyProperty: - __slots__ = ( - "id", - "label", - "description", - "aliases", - "datatype", - "parents", - "related_properties", - "equivalent_properties", - "subjects", - "inverse_properties", - "instanceof", - "ancestors", - ) id: str label: MultiLingualString description: MultiLingualString aliases: MultiLingualStringList datatype: str + instanceof: list[str] parents: list[str] + ancestors: dict[str, int] + inverse_properties: list[str] related_properties: list[str] equivalent_properties: list[str] - subjects: list[str] - inverse_properties: list[str] - instanceof: list[str] - ancestors: dict[str, int] + + # domains + domains: Optional[list[str]] + # ranges + ranges: Optional[list[str]] @staticmethod def empty(id: str): @@ -47,7 +37,8 @@ def empty(id: str): parents=[], related_properties=[], equivalent_properties=[], - subjects=[], + domains=None, + ranges=None, inverse_properties=[], instanceof=[], ancestors={}, @@ -95,7 +86,8 @@ def to_dict(self): "parents": self.parents, "related_properties": self.related_properties, "equivalent_properties": self.equivalent_properties, - "subjects": self.subjects, + "domains": self.domains, + "ranges": self.ranges, "inverse_properties": self.inverse_properties, "instanceof": self.instanceof, "ancestors": self.ancestors, @@ -119,7 +111,8 @@ def get_default_props() -> list[OntologyProperty]: parents=[], related_properties=[], equivalent_properties=[], - subjects=[], + domains=None, + ranges=None, inverse_properties=[], instanceof=[], ancestors={}, @@ -135,7 +128,8 @@ def get_default_props() -> list[OntologyProperty]: parents=[], related_properties=[], equivalent_properties=[], - subjects=[], + domains=None, + ranges=None, inverse_properties=[], instanceof=[str(RDF.Property)], ancestors={}, diff --git a/kgdata/wikidata/models/wdentity.py b/kgdata/wikidata/models/wdentity.py index febcfc7..0a8c57a 100644 --- a/kgdata/wikidata/models/wdentity.py +++ b/kgdata/wikidata/models/wdentity.py @@ -8,19 +8,8 @@ from kgdata.wikidata.models.wdvalue import WDValue -@dataclass +@dataclass(slots=True) class WDEntity: - __slots__ = ( - "id", - "type", - "datatype", - "label", - "description", - "aliases", - "props", - "sitelinks", - ) - id: str # possible values ["item", "property"] type: Literal["item", "property"] diff --git a/kgdata/wikidata/models/wdproperty.py b/kgdata/wikidata/models/wdproperty.py index c3bcf6a..1a8207b 100644 --- a/kgdata/wikidata/models/wdproperty.py +++ b/kgdata/wikidata/models/wdproperty.py @@ -3,8 +3,10 @@ from dataclasses import dataclass from typing import Literal, Mapping +from kgdata.models.multilingual import MultiLingualString, MultiLingualStringList from kgdata.models.ont_property import OntologyProperty from kgdata.wikidata.models.wdentity import WDEntity +from kgdata.wikidata.models.wdstatement import WDStatement # wikibase-lexeme, monolingualtext, wikibase-sense, url, wikibase-property, # wikibase-form, external-id, time, commonsMedia, quantity, wikibase-item, musical-notation, @@ -30,9 +32,10 @@ ] -@dataclass +@dataclass(kw_only=True, slots=True) class WDProperty(OntologyProperty): datatype: WDDataType + constraints: list[WDStatement] @staticmethod def from_entity(ent: WDEntity): @@ -56,10 +59,49 @@ def from_entity(ent: WDEntity): else: assert False, f"Unknown type: {stmt.value.to_dict()}" - subjects = [] - for stmt in ent.props.get("P1629", []): - assert stmt.value.is_entity_id(stmt.value) - subjects.append(stmt.value.as_entity_id()) + constraints = ent.props.get("P2302", []) + domains = None + ranges = None + for stmt in constraints: + entid = stmt.value.as_entity_id_safe() + # subject type constraint + if entid == "Q21503250": + try: + # domains so it must have class -- if not, it's bad and we can ignore + if "P2308" not in stmt.qualifiers: + continue + # and the relation must be instanceof or (instanceof or subclassof), or subclassof + assert "P2309" in stmt.qualifiers, (ent.id, stmt) + relations = [ + x.as_entity_id_safe() for x in stmt.qualifiers["P2309"] + ] + for relation in relations: + assert relation in ["Q21503252", "Q30208840", "Q21514624"], ( + ent.id, + stmt, + ) + except: + continue + domains = [x.as_entity_id_safe() for x in stmt.qualifiers["P2308"]] + + # value-type constraint + if entid == "Q21510865": + try: + # if ranges are classes + assert "P2308" in stmt.qualifiers, (ent.id, stmt) + assert "P2309" in stmt.qualifiers, (ent.id, stmt) + # and the relation must be instanceof or (instanceof or subclassof), or subclassof + relations = [ + x.as_entity_id_safe() for x in stmt.qualifiers["P2309"] + ] + for relation in relations: + assert relation in ["Q21503252", "Q30208840", "Q21514624"], ( + ent.id, + stmt, + ) + except: + continue + ranges = [x.as_entity_id_safe() for x in stmt.qualifiers["P2308"]] inverse_properties = [] for stmt in ent.props.get("P1696", []): @@ -80,10 +122,12 @@ def from_entity(ent: WDEntity): parents=sorted(parents), related_properties=sorted(related_properties), equivalent_properties=sorted(equivalent_properties), - subjects=sorted(subjects), + domains=domains, + ranges=ranges, inverse_properties=sorted(inverse_properties), instanceof=sorted(instanceof), ancestors={}, + constraints=constraints, ) def is_object_property(self): @@ -110,7 +154,8 @@ def to_base(self): parents=self.parents, related_properties=self.related_properties, equivalent_properties=self.equivalent_properties, - subjects=self.subjects, + domains=self.domains, + ranges=self.ranges, inverse_properties=self.inverse_properties, instanceof=self.instanceof, ancestors=self.ancestors, @@ -119,6 +164,33 @@ def to_base(self): def __str__(self): return f"{self.label} ({self.id})" + def to_dict(self): + return { + "id": self.id, + "label": self.label.to_dict(), + "description": self.description.to_dict(), + "datatype": self.datatype, + "aliases": self.aliases.to_dict(), + "parents": self.parents, + "related_properties": self.related_properties, + "equivalent_properties": self.equivalent_properties, + "domains": self.domains, + "ranges": self.ranges, + "inverse_properties": self.inverse_properties, + "instanceof": self.instanceof, + "ancestors": self.ancestors, + "constraints": [s.to_dict() for s in self.constraints], + } + + @classmethod + def from_dict(cls, obj): + obj["label"] = MultiLingualString(**obj["label"]) + obj["description"] = MultiLingualString(**obj["description"]) + obj["aliases"] = MultiLingualStringList(**obj["aliases"]) + obj["ancestors"] = obj["ancestors"] + obj["constraints"] = [WDStatement.from_dict(x) for x in obj["constraints"]] + return cls(**obj) + def normalize_wikidata_datatype(datatype: WDDataType) -> str: if datatype == "wikibase-property" or datatype == "wikibase-item": diff --git a/pyproject.toml b/pyproject.toml index e99a739..77b4e13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kgdata" -version = "6.5.2" +version = "7.0.0" description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)" readme = "README.md" authors = [{ name = "Binh Vu", email = "binh@toan2.com" }] @@ -10,7 +10,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", ] -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ 'orjson >= 3.9.0, < 4.0.0', diff --git a/scripts/build.sh b/scripts/build.sh index f9e7bd3..27a86d3 100644 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -36,18 +36,18 @@ function wikidata_db { # dbpedia_dataset generic_extractor_dump # dbpedia_dataset mapping_extractor_dump -dbpedia_dataset ontology_dump -dbpedia_dataset classes -dbpedia_dataset properties -dbpedia_dataset entities -dbpedia_dataset entity_redirections -dbpedia_dataset entity_labels -dbpedia_dataset entity_metadata -dbpedia_dataset entity_all_types -dbpedia_dataset entity_degrees -dbpedia_dataset entity_types_and_degrees -dbpedia_dataset meta_graph -dbpedia_dataset meta_graph_stats +# dbpedia_dataset ontology_dump +# dbpedia_dataset classes +# dbpedia_dataset properties +# dbpedia_dataset entities +# dbpedia_dataset entity_redirections +# dbpedia_dataset entity_labels +# dbpedia_dataset entity_metadata +# dbpedia_dataset entity_all_types +# dbpedia_dataset entity_degrees +# dbpedia_dataset entity_types_and_degrees +# dbpedia_dataset meta_graph +# dbpedia_dataset meta_graph_stats # ====================================================================== # WIKIDATA Datasets @@ -65,7 +65,7 @@ dbpedia_dataset meta_graph_stats # wikidata_dataset entity_types # wikidata_dataset classes -# wikidata_dataset properties +wikidata_dataset properties # wikidata_dataset class_count # wikidata_dataset property_count @@ -103,18 +103,18 @@ dbpedia_dataset meta_graph_stats # ====================================================================== # DBpedia Databases -dbpedia_db classes -dbpedia_db properties -dbpedia_db entities -dbpedia_db entity_labels -dbpedia_db entity_metadata -dbpedia_db entity_redirections +# dbpedia_db classes +# dbpedia_db properties +# dbpedia_db entities +# dbpedia_db entity_labels +# dbpedia_db entity_metadata +# dbpedia_db entity_redirections # ====================================================================== # WIKIDATA Databases # wikidata_db classes -# wikidata_db properties +wikidata_db properties # wikidata_db entities # wikidata_db entity_labels # wikidata_db entity_metadata