load

#!/usr/bin/env python3

# This utility loads part of the UK legislation dataset into TrustGraph.
# The dataset consists of an index file in JSON format.  Each entry in
# the index describes a piece of legislation, including its date, title,
# and a description if known.  Each entry also gives the filename of the
# legislation's text.  Most of the text files are present in the dataset,
# but the text extraction from the legislation database is only
# semi-complete.

# TrustGraph API client
import trustgraph.api as tg

# Utilities to help construct document 'external metadata'
from trustgraph.knowledge import DigitalDocument, Organization
from trustgraph.knowledge import PublicationEvent, hash, to_uri
from trustgraph.knowledge import PREF_PUBEV, PREF_ORG, PREF_DOC

import json
from urllib.parse import urljoin
import io
import sys

# TrustGraph API client, created once at import time and used by load()
# for every document submitted.  By default URL is localhost:8088
cli = tg.Api()

# Loads a single item of legislation into TrustGraph for processing
def load(item, year):
    """Load one item of legislation into TrustGraph.

    Reads the item's text from ``data/<year>/<item["filename"]>``, builds
    organisation, publication-event and document metadata objects from the
    index entry, then submits the text together with the document metadata
    through the module-level API client.

    Args:
        item: Index entry.  The keys "filename", "publisher", "identifier",
            "title" and "date" are read unconditionally; "description" is
            optional and defaults to None.
        year: Name of the directory under ``data/`` holding the text file.

    Raises:
        OSError: The text file cannot be opened.
        UnicodeDecodeError: The text file is not valid UTF-8.
        KeyError: A required key is missing (when item is a dict).
    """

    # Read the legislation text.  The encoding is given explicitly so the
    # result does not depend on the platform's locale default; the text is
    # re-encoded as UTF-8 below, so it must be decoded the same way.
    with open(f"data/{year}/{item['filename']}", encoding="utf-8") as f:
        text = f.read()

    # Construct an organisation object, identified by a hash of the
    # publisher name
    org_id = to_uri(PREF_ORG, hash(item["publisher"]))
    org = Organization(
        id = org_id,
        name = item["publisher"],
    )

    # Construct a publication event object.  The index carries a single
    # date, which is used for both the start and the end of the event.
    pubev_id = to_uri(PREF_PUBEV, hash(item["identifier"]))
    pubev = PublicationEvent(
        id = pubev_id,
        description = "Publication of " + item["title"],
        start_date = item["date"],
        end_date = item["date"],
        organization = org,
    )

    # Construct a document object.  Its ID is derived from the same
    # identifier hash as the publication event, under a different prefix.
    doc_id = to_uri(PREF_DOC, hash(item["identifier"]))
    doc = DigitalDocument(
        id = doc_id,
        name = item["title"],
        description = item.get("description"),
        publication = pubev,
        url = item["identifier"],
    )

    # Use the API to load the text with metadata objects
    cli.load_text(text.encode("utf-8"), doc_id, doc)

    print("Loaded", item["title"], "->", doc_id)

# Handles a year's worth of data
def load_year(year):
    """Load every item listed in one year's index.

    Reads ``data/<year>/index.json`` and passes each entry, in index order,
    to load() along with the year.

    Args:
        year: Name of the directory under ``data/`` containing index.json.

    Raises:
        OSError: The index file cannot be opened.
        json.JSONDecodeError: The index file is not valid JSON.
    """

    # JSON is UTF-8 by specification, so don't rely on the locale default
    with open(f"data/{year}/index.json", encoding="utf-8") as f:
        ix = json.load(f)

    # Loop over index
    for item in ix:
        load(item, year)

# Each command-line argument names one year of data to load, in the order
# given
years = sys.argv[1:]
for year in years:
    load_year(year)