Support batch validation #20

Open: wants to merge 5 commits into master
149 changes: 97 additions & 52 deletions entityshape/__init__.py
@@ -1,71 +1,116 @@
+import asyncio
+import logging
 import re
-from typing import Any, Dict, Optional
+from re import Pattern
+from typing import Any, Dict, List
 
+import aiohttp
+import requests
 from pydantic import BaseModel
+from rich.console import Console
 
-from entityshape.exceptions import ApiError, EidError, LangError, QidError
-from entityshape.models.compareshape import CompareShape
-from entityshape.models.result import Result
-from entityshape.models.shape import Shape
+from entityshape.exceptions import (
+    ApiError,
+    EidError,
+    EntityIdError,
+    LangError,
+    NoEntitySchemaDataError,
+    WikibaseEntitySchemaDownloadError,
+)
+from entityshape.models.entity import Entity
 
+console = Console()
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
 
-class EntityShape(BaseModel):
-    """This class models the entityshape API
-    It has a default timeout of 10 seconds
-
-    The API currently only support items"""
+class EntityShape(BaseModel):
+    """Downloads and validates Wikidata entities"""
 
-    entity_id: str = ""  # item or lexeme
-    eid: str = ""  # entityshape
+    entity_ids: List[str]
+    eid: str  # entityshape
     lang: str = "en"  # language defaults to English
-    result: Result = Result()
-    eid_regex = re.compile(r"E\d+")
-    entity_id_regex = re.compile(r"[QL]\d+")
-    compare_shape_result: Optional[Dict[str, Any]] = None
+    eid_regex: Pattern = re.compile(r"E\d+")
     wikibase_url: str = "http://www.wikidata.org"
    mediawiki_api_url: str = "https://www.wikidata.org/w/api.php"
    user_agent: str = "entityshape (https://github.com/dpriskorn/entityshape)"
+    entities: List[Entity] = []
+    entity_schema_data: Dict[str, Any] = {}
 
     def __check_inputs__(self):
         if not self.lang:
             raise LangError("We only support 2 and 3 letter language codes")
         if not self.eid:
             raise EidError("We need an entityshape EID")
         if not 2 <= len(self.lang) <= 3:
             raise LangError("Language code is not correct length")
         if not re.match(self.eid_regex, self.eid):
             raise EidError("EID has to be E followed by only numbers like this: E100")
-        if not self.entity_id:
-            raise QidError("We need an item QID")
-        if not re.match(self.entity_id_regex, self.entity_id):
-            raise QidError("QID has to be Q followed by only numbers like this: Q100")
+        if not self.entity_ids:
+            raise EntityIdError("We need entity ids")
+        # if not re.match(self.entity_id_regex, self.entity_id):
+        # raise QidError("QID has to be Q followed by only numbers like this: Q100")
 
-    def validate_and_get_result(self) -> Result:
-        """This method checks if we got the 3 parameters we need and
-        gets the results and return them"""
-        self.__check_inputs__()
-        self.__validate__()
-        return self.__parse_result__()
+    def download_and_validate(self):
+        self.__check_inputs__()  # Check if inputs are valid
+        self.download_schema()
+        if not self.entity_schema_data:
+            raise NoEntitySchemaDataError("Got no entity schema data from Wikidata")
+        with console.status("Downloading entity json"):
+            loop = asyncio.get_event_loop()
+            loop.run_until_complete(self.__download_json__())
+        print(f"Downloaded {len(self.entities)} entities")
+        if self.entities:
+            with console.status("Validating entities"):
+                [entity.check_and_validate() for entity in self.entities]
+            print("Validation finished")
+        else:
+            print("No entities to validate")
 
-    def __validate__(self):
-        shape: Shape = Shape(self.eid, self.lang)
-        comparison: CompareShape = CompareShape(
-            shape.get_schema_shape(),
-            self.entity_id,
-            self.lang,
-            wikibase_url=self.wikibase_url,
-            mediawiki_api_url=self.mediawiki_api_url,
-        )
-        self.compare_shape_result = {}
-        self.compare_shape_result = {
-            "general": comparison.get_general(),
-            "properties": comparison.get_properties(),
-            "statements": comparison.get_statements(),
-        }
+    async def __download_json__(self) -> None:
+        """Get all the JSON data we need asynchronously"""
+        logger.debug("__download_json__: running")
+        async with aiohttp.ClientSession() as session:
+            # TODO add user agent
+            # Create tasks for downloading JSON data for each entity_id
+            tasks = [
+                self._get_entity_json(entity_id, session)
+                for entity_id in self.entity_ids
+            ]
 
-    def __parse_result__(self) -> Result:
-        if self.compare_shape_result:
-            self.result = Result(**self.compare_shape_result)
-            self.result.lang = self.lang
-            self.result.analyze()
-            return self.result
-        else:
-            return Result()
+            # Gather and wait for all tasks to complete
+            await asyncio.gather(*tasks)
+            # self.json_responses = await asyncio.gather(*tasks)
+            # Handle results as needed
+            # We don't handle the results for now.
+
+    async def _get_entity_json(self, entity_id: str, session) -> None:
+        """
+        Downloads the entity from Wikidata asynchronously
+        """
+        logger.debug("_get_entity_json: running")
+        url = f"{self.wikibase_url}/wiki/Special:EntityData/{entity_id}.json"
+
+        async with session.get(url) as response:
+            if response.status == 200:
+                entity_data = await response.json()
+                self.entities.append(
+                    Entity(
+                        entity_id=entity_id,
+                        entity_data=entity_data,
+                        eid=self.eid,
+                        entity_schema_data=self.entity_schema_data,
+                    )
+                )
+            else:
+                raise WikibaseEntitySchemaDownloadError(
+                    f"Got {response.status} from {url}. "
+                    f"Please check that the configuration is correct"
+                )
+
+    def download_schema(self):
+        """
+        Downloads the schema from wikidata
+        """
+        url: str = f"https://www.wikidata.org/wiki/EntitySchema:{self.eid}?action=raw"
+        # todo add user agent
+        response = requests.get(url)
+        if response.status_code == 404:
+            raise WikibaseEntitySchemaDownloadError()
+        self.entity_schema_data: dict = response.json()
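
For context, a minimal usage sketch of the batch API this pull request introduces, based only on the code above; the entity IDs and the schema EID are example values, and the final loop assumes nothing about Entity beyond the entity_id field it is constructed with here.

```python
# Hypothetical usage of the batch validation entry point added in this PR.
# EntityShape, entity_ids, eid, lang, download_and_validate() and .entities
# come from the diff above; Q42, Q80 and E100 are example identifiers.
from entityshape import EntityShape

shape = EntityShape(
    entity_ids=["Q42", "Q80"],  # batch of items to validate
    eid="E100",                 # entity schema to validate against
    lang="en",
)
# Downloads the schema, then all entities concurrently, then validates each one.
shape.download_and_validate()

# Each downloaded entity is kept on .entities after validation.
for entity in shape.entities:
    print(entity.entity_id)
```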
10 changes: 9 additions & 1 deletion entityshape/exceptions.py
@@ -2,7 +2,7 @@ class LangError(BaseException):
     pass
 
 
-class QidError(BaseException):
+class EntityIdError(BaseException):
     pass
 
 
@@ -20,3 +20,11 @@ class WikibaseEntitySchemaDownloadError(BaseException):
 
 class WikibasePropertiesDownloadError(BaseException):
     pass
+
+
+class NoEntitySchemaDataError(BaseException):
+    pass
+
+
+class MissingInformationError(BaseException):
+    pass
114 changes: 59 additions & 55 deletions entityshape/models/compareshape.py
@@ -3,14 +3,19 @@
 Copyright 2021 Mark Tully
 Compares a json shape from shape.py with wikidata json
 """
+import logging
+from typing import Any, Dict
 
 import requests
 from requests import Response
 
 from entityshape.exceptions import (
+    MissingInformationError,
     WikibaseEntitySchemaDownloadError,
     WikibasePropertiesDownloadError,
 )
 
+logger = logging.getLogger(__name__)
 
 
 class CompareShape:
     """
@@ -29,20 +34,24 @@ def __init__(
         self,
         shape: dict,
         entity: str,
+        entity_data: Dict[str, Any],
         language: str,
         mediawiki_api_url: str,
         wikibase_url: str,
     ):
         self._entity: str = entity
+        self.entity_data = entity_data
         self._shape: dict = shape
         self.mediawiki_api_url = mediawiki_api_url
         self.wikibase_url = wikibase_url
         self._property_responses: dict = {}
 
-        self._get_entity_json()
-        if self._entities["entities"][self._entity]:
-            self._get_props(self._entities["entities"][self._entity]["claims"])
-            self._get_property_names(language)
+        # self._get_entity_json()
+        if not self.entity_data:
+            raise MissingInformationError()
+        if self.entity_data["entities"][self._entity]:
+            self._get_props(self.entity_data["entities"][self._entity]["claims"])
+            # self._get_property_names(language)
         self._compare_statements()
         self._compare_properties()
 
@@ -68,9 +77,12 @@ def get_general(self) -> dict:
         general: dict = {}
         properties: list = ["lexicalCategory", "language"]
         for item in properties:
-            if item in self._shape and item in self._entities["entities"][self._entity]:
+            if (
+                item in self._shape
+                and item in self.entity_data["entities"][self._entity]
+            ):
                 expected: list = self._shape[item]["allowed"]
-                actual: str = self._entities["entities"][self._entity][item]
+                actual: str = self.entity_data["entities"][self._entity][item]
                 general[item] = "incorrect"
                 if actual in expected:
                     general[item] = "correct"
@@ -81,7 +93,7 @@ def _compare_statements(self):
         Compares the statements in the entity to the schema
         """
         statements: dict = {}
-        claims: dict = self._entities["entities"][self._entity]["claims"]
+        claims: dict = self.entity_data["entities"][self._entity]["claims"]
         for claim in claims:
             statement_results: list = []
             property_statement_results: list = []
@@ -112,10 +124,12 @@ def _compare_properties(self):
         properties: dict = {}
         for claim in self._props:
             response: str = "missing"
-            child: dict = {"name": self._names[claim], "necessity": "absent"}
+            # Disable use of _names because it slows the validation down
+            # child: dict = {"name": self._names[claim], "necessity": "absent"}
+            child: dict = {"name": "unknown", "necessity": "absent"}
             if claim in self._shape and "necessity" in self._shape[claim]:
                 child["necessity"] = self._shape[claim]["necessity"]
-            if claim in self._entities["entities"][self._entity]["claims"]:
+            if claim in self.entity_data["entities"][self._entity]["claims"]:
                 response = self._process_claim(claim, child)
             if response != "":
                 child["response"] = response
@@ -167,20 +181,6 @@ def _assess_cardinality(self, claim, child):
             cardinality = "not enough correct statements"
         return cardinality
 
-    def _get_entity_json(self):
-        """
-        Downloads the entity from wikidata
-        """
-        url: str = f"{self.wikibase_url}/wiki/Special:EntityData/{self._entity}.json"
-        response: Response = requests.get(url)
-        if response.status_code == 200:
-            self._entities = response.json()
-        else:
-            raise WikibaseEntitySchemaDownloadError(
-                f"Got {response.status_code} from {url}. "
-                f"Please check that the configuration is correct"
-            )
-
     def _get_props(self, claims: dict):
         """
         Gets a list of properties included in the entity
@@ -194,37 +194,39 @@
             if claim not in self._props and claim.startswith("P"):
                 self._props.append(claim)
 
-    def _get_property_names(self, language: str):
-        """
-        Gets the names of properties from wikidata
-        """
-        self._names: dict = {}
-        wikidata_property_list: list = [
-            self._props[i * 49 : (i + 1) * 49]
-            for i in range((len(self._props) + 48) // 48)
-        ]
-        for element in wikidata_property_list:
-            required_properties: str = "|".join(element)
-            url: str = (
-                f"{self.mediawiki_api_url}?action=wbgetentities&ids="
-                f"{required_properties}&props=labels&languages={language}&format=json"
-            )
-            response: Response = requests.get(url)
-            if response.status_code == 200:
-                json_text: dict = response.json()
-                print(url)
-                # print(json_text)
-            else:
-                raise WikibasePropertiesDownloadError(
-                    f"Got {response.status_code} from {url}"
-                )
-            for item in element:
-                try:
-                    self._names[json_text["entities"][item]["id"]] = json_text[
-                        "entities"
-                    ][item]["labels"][language]["value"]
-                except KeyError:
-                    self._names[json_text["entities"][item]["id"]] = ""
+    # def _get_property_names(self, language: str):
+    #     """
+    #     Gets the names of properties from wikidata
+    #     """
+    #     # Optimize this
+    #     self._names: dict = {}
+    #     # What is the purpose of this?
+    #     wikidata_property_list: list = [
+    #         self._props[i * 49 : (i + 1) * 49]
+    #         for i in range((len(self._props) + 48) // 48)
+    #     ]
+    #     for element in wikidata_property_list:
+    #         required_properties: str = "|".join(element)
+    #         url: str = (
+    #             f"{self.mediawiki_api_url}?action=wbgetentities&ids="
+    #             f"{required_properties}&props=labels&languages={language}&format=json"
+    #         )
+    #         response: Response = requests.get(url)
+    #         if response.status_code == 200:
+    #             json_text: dict = response.json()
+    #             print(url)
+    #             # print(json_text)
+    #         else:
+    #             raise WikibasePropertiesDownloadError(
+    #                 f"Got {response.status_code} from {url}"
+    #             )
+    #         for item in element:
+    #             try:
+    #                 self._names[json_text["entities"][item]["id"]] = json_text[
+    #                     "entities"
+    #                 ][item]["labels"][language]["value"]
+    #             except KeyError:
+    #                 self._names[json_text["entities"][item]["id"]] = ""
 
     def _process_allowed_in_shape_claim(self, claim, param):
         allowed = "correct"
@@ -246,10 +248,12 @@ def _process_required_in_shape_claim(shape_claim, datavalue):
         required_value: str = shape_claim["required"][required_property][0]
 
         query_entity: str = datavalue["value"]["id"]
+        # Why is this fetch needed?
         url: str = (
             f"https://www.wikidata.org/w/api.php?action=wbgetclaims"
             f"&entity={query_entity}&property={required_property}&format=json"
         )
+        logger.debug(f"Fetching: {url}")
         response: Response = requests.get(url)
         if response.status_code == 200:
             json_text: dict = response.json()
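
A sketch of how a caller might drive the reworked CompareShape, which now expects the Special:EntityData JSON to be passed in instead of fetching it itself; the Shape helper and the URLs are taken from the pre-existing code in this diff, while the identifiers and the up-front requests call are illustrative.

```python
# Hypothetical caller for the reworked CompareShape: the entity JSON is
# fetched up front (EntityShape now does this asynchronously) and passed in,
# since CompareShape._get_entity_json() was removed in this PR.
import requests

from entityshape.models.compareshape import CompareShape
from entityshape.models.shape import Shape  # not touched by this PR

entity_id = "Q42"  # example item
entity_data = requests.get(
    f"http://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
).json()  # same payload the removed _get_entity_json() used to download

shape = Shape("E100", "en")  # example schema EID, as the old __init__.py used it
comparison = CompareShape(
    shape=shape.get_schema_shape(),
    entity=entity_id,
    entity_data=entity_data,  # {"entities": {"Q42": {...}}}
    language="en",
    mediawiki_api_url="https://www.wikidata.org/w/api.php",
    wikibase_url="http://www.wikidata.org",
)
print(comparison.get_general())
print(comparison.get_properties())
print(comparison.get_statements())
```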