Skip to content

Commit

Permalink
feat: scripts to store and load matching occurrences
Browse files Browse the repository at this point in the history
add a script that writes the contenttype_id/object_id of every instance
whose db and xml contents match into a single file

add a script that loads all the matching IDs and tags them using a collection
  • Loading branch information
b1rger committed Jan 23, 2024
1 parent 2f91675 commit 1ae9e93
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 0 deletions.
20 changes: 20 additions & 0 deletions apis_ontology/management/commands/tag_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import json
import pathlib

from django.core.management.base import BaseCommand
from django.contrib.contenttypes.models import ContentType
from apis_core.collections.models import SkosCollection, SkosCollectionContentObject


class Command(BaseCommand):
    help = "Tag db/xml matches (produced by xml_collection_ok) with a SkosCollection"

    def add_arguments(self, parser):
        # JSON file produced by the xml_collection_ok command (output.json)
        parser.add_argument("--path", type=pathlib.Path)

    def handle(self, *args, **options):
        """Read the match file and attach every listed object to one collection.

        The JSON file maps keys to {"content_type_id": ..., "object_id": ...}
        entries; each referenced object is linked to a single SkosCollection.
        Both get_or_create calls keep re-runs of the command idempotent.
        """
        if options["path"]:
            sc, _ = SkosCollection.objects.get_or_create(name="20240123 - xml and db match")
            data = json.loads(options["path"].read_text())
            for key, entry in data.items():
                print(key)
                content_type = ContentType.objects.get(pk=entry["content_type_id"])
                SkosCollectionContentObject.objects.get_or_create(
                    collection=sc,
                    content_type=content_type,
                    object_id=entry["object_id"],
                )
87 changes: 87 additions & 0 deletions apis_ontology/management/commands/xml_collection_ok.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json
import xml.etree.ElementTree as ET
import pathlib
import unidecode
from django.core.management.base import BaseCommand

from apis_ontology.models import Source

# XML namespace map for the lexicon article documents.
ns = {'b': 'http://www.biographien.ac.at'}

# Matches found so far, keyed by Source pk; dumped to output.json by handle().
DATA = {}


def filename_clean(file):
    """Return *file*'s name with the first matching 'resolved' marker removed."""
    name = file.name
    # Longest markers first, so "__resolved" is not half-eaten by "_resolved".
    for marker in ("_online_resolved", "__resolved", "_resolved"):
        if marker in name:
            return name.replace(marker, "")
    return name


def transliterate_v1(text: str) -> str:
    """Return an ASCII approximation of *text* (via unidecode) so that
    xml and db texts can be compared independent of accent/encoding drift."""
    return unidecode.unidecode(text)


def equals_database_entry(file):
    """Compare the XML article in *file* with its database counterpart.

    Source objects are looked up by the cleaned filename plus the text of
    the PubInfo element (or, when present, the Lieferung element).  When
    exactly one Source is found and both its Haupttext and Kurzdefinition
    match the XML (compared stripped and transliterated to ASCII), the
    match is recorded in the module-level DATA dict and True is returned;
    otherwise False.
    """
    root = ET.parse(file).getroot()
    filename = filename_clean(file)
    sources = []
    pubinfo = root.find("./b:Lexikonartikel/b:PubInfo", ns)
    if pubinfo is not None:
        sources = Source.objects.filter(orig_filename=filename, pubinfo=pubinfo.text)
    lieferung = root.find("./b:Lexikonartikel/b:Lieferung", ns)
    if lieferung is not None:
        # NOTE(review): when both PubInfo and Lieferung exist, the Lieferung
        # lookup overrides the PubInfo one — confirm this is intended.
        sources = Source.objects.filter(orig_filename=filename, pubinfo=lieferung.text)
    if len(sources) == 1:
        source = sources[0]
        # Bug fix: `if element:` is falsy for an Element with no children
        # (deprecated ElementTree behavior), so the original only hit the
        # itertext() branch for elements with subelements.  Test against
        # None explicitly; itertext() covers both the plain-text and the
        # nested-children cases (it yields just .text for a leaf element).
        haupttext_el = root.find("./b:Lexikonartikel/b:Haupttext", ns)
        haupttext = "".join(haupttext_el.itertext()) if haupttext_el is not None else ""
        kurzdefinition_el = root.find("./b:Lexikonartikel/b:Kurzdefinition", ns)
        kurzdefinition = "".join(kurzdefinition_el.itertext()) if kurzdefinition_el is not None else ""

        db_haupttext = ""
        if source.content_object.oebl_haupttext is not None:
            db_haupttext = source.content_object.oebl_haupttext.text

        db_kurzinfo = ""
        if source.content_object.oebl_kurzinfo is not None:
            db_kurzinfo = source.content_object.oebl_kurzinfo.text

        if (
            transliterate_v1(haupttext.strip()) == transliterate_v1(db_haupttext.strip()) and
            transliterate_v1(kurzdefinition.strip()) == transliterate_v1(db_kurzinfo.strip())):
            DATA[source.id] = {"content_type_id": source.content_type_id, "object_id": source.object_id}
            return True
    if len(sources) > 1:
        print(f"{file} equals multiple entries")
    return False


class Command(BaseCommand):
    help = "Import data from legacy xml files"

    def add_arguments(self, parser):
        # point to XML_RESOLVE_IN_PROGRESS folder
        parser.add_argument("--path", type=pathlib.Path)

    def handle(self, *args, **options):
        """Compare each XML file under --path (or the single file given)
        against the database and dump the collected matches to output.json."""
        files = []
        path = options["path"]
        if path:
            # A directory is walked recursively for *.xml; a file is taken as-is.
            files = list(path.glob("**/*.xml")) if path.is_dir() else [path]

        for xml_file in sorted(files):
            print(xml_file)
            equals_database_entry(xml_file)
        pathlib.Path("output.json").write_text(json.dumps(DATA, indent=2))

0 comments on commit 1ae9e93

Please sign in to comment.