Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JSON reader & extractor #27

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
22 changes: 22 additions & 0 deletions ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ def format(self, value):
if value and value not in self.convert_to_none:
return value


class ExternalFile(Extractor):
'''
Free for all external file extractor that provides a stream to `stream_handler`
Expand All @@ -491,6 +492,27 @@ def _apply(self, metadata, *nargs, **kwargs):
return self.stream_handler(open(metadata['associated_file'], 'r'))


class JSON(Extractor):
    '''An extractor to extract data from JSON.

    The extractor walks into nested dictionaries by applying its keys
    in order; with a single key it reads from a flat dictionary.

    Parameters:
        keys (Iterable[str]): the keys with which to retrieve a field value from the source
    '''

    def __init__(self, *keys, **kwargs):
        # Keys are applied left-to-right to drill into nested dicts.
        self.keys = list(keys)
        super().__init__(**kwargs)

    def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs):
        '''Return the value reached by applying ``self.keys[key_index:]`` to ``data``.

        Returns ``None`` when an intermediate key is missing or the
        intermediate value is not a dict. (The recursive original
        crashed with ``AttributeError`` in that case, because it called
        ``.get`` on the ``None`` returned by the previous lookup.)
        '''
        for key in self.keys[key_index:]:
            if not isinstance(data, dict):
                return None
            data = data.get(key)
        return data


class RDF(Extractor):
"""An extractor to extract data from RDF triples

Expand Down
13 changes: 9 additions & 4 deletions ianalyzer_readers/readers/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,23 @@
import logging
import csv

from requests import Response

logging.basicConfig(level=logging.WARNING)
logging.getLogger('ianalyzer-readers').setLevel(logging.DEBUG)
# setLevel() returns None, so chaining it after getLogger() would bind
# `logger` to None. Bind the logger first, then configure its level.
logger = logging.getLogger('ianalyzer-readers')
logger.setLevel(logging.DEBUG)

Source = Union[str, Tuple[Union[str, bytes], Dict], bytes]
SourceType = Union[str, Response, bytes]
Source = Union[SourceType, Tuple[SourceType, Dict]]
'''
Type definition for the source input to some Reader methods.

Sources are either:

- a string with the path to a filename
- a tuple containing a path to a filename, and a dictionary with metadata
- binary data with the file contents. This is not supported on all Reader subclasses.
- binary data with the file contents. This is not supported on all Reader subclasses
- a requests.Response
- a tuple of one of the above, and a dictionary with metadata

'''

Document = Dict[str, Any]
Expand Down
73 changes: 73 additions & 0 deletions ianalyzer_readers/readers/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import json
from os.path import isfile
from typing import Iterable, List, Optional, Union

from pandas import json_normalize
from requests import Response

from .core import Reader, Document, Source
import ianalyzer_readers.extract as extract

class JSONReader(Reader):
    """
    A base class for Readers of JSON encoded data.

    The reader can either be used on a collection of JSON files, in which each file represents a document,
    or for a JSON file containing lists of documents.

    If the attributes `record_path` and `meta` are passed, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data

    Attributes:
        record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; do not define if the corpus is structured as one file per document
        meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located
    """

    record_path: Optional[List[str]] = None
    meta: Optional[List[Union[str, List[str]]]] = None

    def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
        """
        Given a Python dictionary, returns an iterable of extracted documents.

        Parameters:
            source: the input data

        Returns:
            list of documents
        """
        # Fail fast on incompatible extractors before doing any parsing work.
        self._reject_extractors(extract.XML, extract.CSV, extract.RDF)

        if isinstance(source, tuple):
            source_data, metadata = source
        else:
            source_data, metadata = source, None
        json_data = self._get_json_data(source_data)

        if self.record_path and self.meta:
            # Unnest a single large JSON file into one record per document.
            documents = json_normalize(
                json_data, record_path=self.record_path, meta=self.meta
            ).to_dict('records')
        else:
            # One source == one document.
            documents = [json_data]

        for doc in documents:
            yield {
                field.name: field.extractor.apply(
                    doc, metadata=metadata, *nargs, **kwargs
                )
                for field in self.fields
            }

    def _get_json_data(self, source: Source) -> dict:
        """Load JSON content from a file path, a requests.Response, or raw bytes.

        Raises:
            FileNotFoundError: if a string path does not point to an existing file
            TypeError: if the source is of an unsupported type
        """
        # Check Response/bytes before treating the source as a path:
        # os.path.isfile raises TypeError on a Response object.
        if isinstance(source, Response):
            return source.json()
        if isinstance(source, bytes):
            return json.loads(source)
        if isinstance(source, str):
            if isfile(source):
                with open(source, "r") as f:
                    return json.load(f)
            raise FileNotFoundError(f"No file found at {source}")
        raise TypeError("Unexpected source type for JSON Reader")
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ dependencies = [
"beautifulsoup4",
"lxml",
"openpyxl",
"pandas",
"requests",
"rdflib",
]

Expand Down
22 changes: 21 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#
beautifulsoup4==4.12.3
# via ianalyzer_readers (setup.py)
certifi==2024.8.30
# via requests
charset-normalizer==3.4.0
# via requests
click==8.1.7
# via
# mkdocs
Expand All @@ -20,6 +24,8 @@ ghp-import==2.1.0
# via mkdocs
griffe==0.42.0
# via mkdocstrings-python
idna==3.10
# via requests
iniconfig==2.0.0
# via pytest
isodate==0.6.1
Expand Down Expand Up @@ -56,12 +62,16 @@ mkdocstrings==0.24.1
# via mkdocstrings-python
mkdocstrings-python==1.9.0
# via ianalyzer_readers (setup.py)
numpy==2.1.3
# via pandas
openpyxl==3.1.2
# via ianalyzer_readers (setup.py)
packaging==24.0
# via
# mkdocs
# pytest
pandas==2.2.3
# via ianalyzer_readers (setup.py)
pathspec==0.12.1
# via mkdocs
platformdirs==4.2.0
Expand All @@ -77,7 +87,11 @@ pyparsing==3.1.2
pytest==8.1.1
# via ianalyzer_readers (setup.py)
python-dateutil==2.9.0.post0
# via ghp-import
# via
# ghp-import
# pandas
pytz==2024.2
# via pandas
pyyaml==6.0.1
# via
# mkdocs
Expand All @@ -87,6 +101,8 @@ pyyaml-env-tag==0.1
# via mkdocs
rdflib==7.0.0
# via ianalyzer_readers (setup.py)
requests==2.32.3
# via ianalyzer_readers (setup.py)
six==1.16.0
# via
# isodate
Expand All @@ -95,5 +111,9 @@ soupsieve==2.5
# via beautifulsoup4
tomli==2.0.1
# via pytest
tzdata==2024.2
# via pandas
urllib3==2.2.3
# via requests
watchdog==4.0.0
# via mkdocs
61 changes: 61 additions & 0 deletions tests/json/data/Macbeth.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{"TITLE":"ACT I",
"SCENE":[
{
"TITLE":"SCENE I. A desert place.",
"STAGEDIR":[
"Thunder and lightning. Enter three Witches",
"Exeunt"
],
"SPEECH":[
{
"SPEAKER":"First Witch",
"LINE":[
"When shall we three meet again",
"In thunder, lightning, or in rain?"
]
},
{
"SPEAKER":"Second Witch",
"LINE":[
"When the hurlyburly's done,",
"When the battle's lost and won."
]
},
{
"SPEAKER":"Third Witch",
"LINE":"That will be ere the set of sun."
},
{
"SPEAKER":"First Witch",
"LINE":"Where the place?"
},
{
"SPEAKER":"Second Witch",
"LINE":"Upon the heath."
},
{
"SPEAKER":"Third Witch",
"LINE":"There to meet with Macbeth."
},
{
"SPEAKER":"First Witch",
"LINE":"I come, Graymalkin!"
},
{
"SPEAKER":"Second Witch",
"LINE":"Paddock calls."
},
{
"SPEAKER":"Third Witch",
"LINE":"Anon."
},
{
"SPEAKER":"ALL",
"LINE":[
"Fair is foul, and foul is fair:",
"Hover through the fog and filthy air."
]
}
]
}]
}
69 changes: 69 additions & 0 deletions tests/json/json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from glob import glob
import json
import os
from typing import Union

from ianalyzer_readers.extract import JSON
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.readers.json import JSONReader


def merge_lines(lines: Union[list, str]) -> str:
    """Join a list of lines into one newline-separated string; pass strings through unchanged."""
    if not isinstance(lines, list):
        return lines
    return "\n".join(lines)


class JSONDocumentReader(JSONReader):
    """
    Example reader that would operate on corpora with one json file per document
    """

    data_directory = os.path.join(os.path.dirname(__file__), "data")

    def sources(self, **kwargs):
        # Yield a single in-memory document, serialized to UTF-8 bytes.
        document = {
            "TITLE": "ACT I",
            "SCENE": {
                "TITLE": "SCENE I. A desert place.",
                "STAGEDIR": [
                    "Thunder and lightning. Enter three Witches",
                    "Exeunt",
                ],
                "SPEECH": {
                    "SPEAKER": "First Witch",
                },
            },
        }
        yield json.dumps(document).encode('utf-8')

    act = Field("act", JSON("TITLE"))
    character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER"))
    scene = Field("scene", JSON("SCENE", "TITLE"))

    fields = [act, character, scene]


class JSONMultipleDocumentReader(JSONDocumentReader):
    """
    Example JSON reader for testing parsing arrays in JSON, using JSON data from https://github.com/tux255/analyzing-shakespeare
    """

    def sources(self, **kwargs):
        # glob already returns paths that include data_directory; the
        # original re-joined the directory onto each result, which only
        # worked by accident because os.path.join discards its first
        # argument when the second is absolute.
        yield from glob(os.path.join(self.data_directory, "*.json"))

    record_path = ["SCENE", "SPEECH"]
    meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]]

    act = Field("act", JSON("TITLE"))
    scene = Field("scene", JSON("SPEECH.TITLE"))
    character = Field("character", JSON("SPEAKER"))
    lines = Field("lines", JSON("LINE", transform=merge_lines))
    stage_dir = Field("stage_direction", JSON("SPEECH.STAGEDIR", transform=merge_lines))

    fields = [act, scene, character, lines, stage_dir]
43 changes: 43 additions & 0 deletions tests/test_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from tests.json.json_reader import JSONDocumentReader, JSONMultipleDocumentReader

# Expected documents for JSONMultipleDocumentReader over Macbeth.json:
# the first and last speeches are spelled out in full; the 8 speeches
# in between are stand-ins (only the first and last are asserted).
expected = [
    {
        'act': 'ACT I',
        'scene': 'SCENE I. A desert place.',
        'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt',
        'character': 'First Witch',
        'lines': 'When shall we three meet again\nIn thunder, lightning, or in rain?',
    },
    *[{}] * 8,
    {
        'act': 'ACT I',
        'scene': 'SCENE I. A desert place.',
        'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt',
        'character': 'ALL',
        'lines': "Fair is foul, and foul is fair:\nHover through the fog and filthy air.",
    },
]


def test_json_parse_single_document():
    '''test that the JSON reader extracts fields from a single in-memory document'''
    docs = list(JSONDocumentReader().documents())
    assert len(docs) == 1
    doc = docs[0]
    assert doc.get('act') == 'ACT I'
    assert doc.get('character') == 'First Witch'
    assert doc.get('scene') == 'SCENE I. A desert place.'


def test_json_parse_multiple_documents():
    '''test that JSON reader can parse multiple documents from an array in a single file'''
    docs = list(JSONMultipleDocumentReader().documents())
    assert len(docs) == len(expected)
    # Only the first and last speeches are fully specified in `expected`.
    _assert_matches(expected[0], docs[0])
    _assert_matches(expected[-1], docs[-1])


def _assert_matches(target: dict, doc: dict):
assert len(target.keys()) == len(doc.keys())
for key in target.keys():
assert doc.get(key) == target.get(key)
Loading