From 26307bd5fc1424e62d788b4397f8502a19ff80d0 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 29 Aug 2024 18:44:00 +0200 Subject: [PATCH 01/13] feat: add JSON extractor and reader --- ianalyzer_readers/extract.py | 16 +++++++++++ ianalyzer_readers/readers/json.py | 45 +++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 ianalyzer_readers/readers/json.py diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index fe4e53b..57ca5de 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -467,6 +467,7 @@ def format(self, value): if value and value not in self.convert_to_none: return value + class ExternalFile(Extractor): ''' Free for all external file extractor that provides a stream to `stream_handler` @@ -491,6 +492,21 @@ def _apply(self, metadata, *nargs, **kwargs): return self.stream_handler(open(metadata['associated_file'], 'r')) +class JSON(Extractor): + ''' An extractor to extract data from JSON + This extractor assumes that each source is a flat dictionary + + Parameters: + key: the key with which to retrieve a field from the source + ''' + def __init__(self, key: str, *nargs, **kwargs): + self.key = key + super().__init__(*nargs, **kwargs) + + def _apply(self, data, **kwargs): + return data.get(self.key) + + class RDF(Extractor): ''' An extractor to extract data from RDF triples diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py new file mode 100644 index 0000000..8b79c99 --- /dev/null +++ b/ianalyzer_readers/readers/json.py @@ -0,0 +1,45 @@ +import json +from os.path import isfile +import requests +from typing import Iterable + +from .core import Reader, Document, Source +import ianalyzer_readers.extract as extract + +class JSONReader(Reader): + ''' + A base class for Readers of JSON encoded data. 
+ ''' + + def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: + ''' + Given a Python dictionary, returns an iterable of extracted documents. + + Parameters: + source: a + ''' + if type(source) == tuple: + metadata = source[1] + json_data = self._get_json_data(source[0]) + else: + metadata = None + json_data = self._get_json_data(source) + self._reject_extractors(extract.XML, extract.CSV, extract.RDF) + + field_dict = { + field.name: field.extractor.apply( + json_data, metadata=metadata, *nargs, **kwargs + ) + for field in self.fields + } + + yield field_dict + + def _get_json_data(self, source: Source) -> dict: + if type(source) == bytes: + return source + elif isfile(source): + return json.load(source) + else: + response = requests.get(source) + return response.json() From 8ffd412c5aacf0c855489cebbade4901a5722eb6 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Sep 2024 14:36:49 +0200 Subject: [PATCH 02/13] allow Response as type for Source --- ianalyzer_readers/readers/core.py | 16 +++++++++++----- ianalyzer_readers/readers/json.py | 14 ++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/ianalyzer_readers/readers/core.py b/ianalyzer_readers/readers/core.py index 16904a7..a7fb303 100644 --- a/ianalyzer_readers/readers/core.py +++ b/ianalyzer_readers/readers/core.py @@ -11,18 +11,24 @@ from typing import List, Iterable, Dict, Any, Union, Tuple import logging +from requests import Response + logger = logging.getLogger() -Source = Union[str, Tuple[str, Dict], bytes] -''' +SourceType = Union[str, Response, bytes] +Source = Union[SourceType, Tuple[SourceType, Dict]] + +""" Type definition for the source input to some Reader methods. Sources are either: - a string with the path to a filename -- a tuple containing a path to a filename, and a dictionary with metadata -- binary data with the file contents. This is not supported on all Reader subclasses. -''' +- binary data with the file contents. 
This is not supported on all Reader subclasses +- a requests.Response +- a tuple of one of the above, and a dictionary with metadata + +""" Document = Dict[str, Any] ''' diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index 8b79c99..2a3c28e 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -1,8 +1,9 @@ import json from os.path import isfile -import requests from typing import Iterable +from requests import Response + from .core import Reader, Document, Source import ianalyzer_readers.extract as extract @@ -36,10 +37,11 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: yield field_dict def _get_json_data(self, source: Source) -> dict: - if type(source) == bytes: - return source - elif isfile(source): + if isfile(source): return json.load(source) + elif type(source) == Response: + return source.json() + elif type(source) == bytes: + return json.loads(source) else: - response = requests.get(source) - return response.json() + raise Exception("Unexpected source type for JSON Reader") From 1907919ff26e6e3f44c2577ed8ffbd62a95a2790 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 26 Sep 2024 10:16:55 +0200 Subject: [PATCH 03/13] add unit test --- ianalyzer_readers/extract.py | 14 +++++-- ianalyzer_readers/readers/json.py | 3 +- tests/json/data/Macbeth.json | 61 +++++++++++++++++++++++++++++++ tests/json/json_reader.py | 33 +++++++++++++++++ tests/test_json_reader.py | 22 +++++++++++ 5 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 tests/json/data/Macbeth.json create mode 100644 tests/json/json_reader.py create mode 100644 tests/test_json_reader.py diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index 57ca5de..ba3f860 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -499,12 +499,18 @@ class JSON(Extractor): Parameters: key: the key with which to retrieve a field from the source ''' - 
def __init__(self, key: str, *nargs, **kwargs): - self.key = key - super().__init__(*nargs, **kwargs) + def __init__(self, *keys: Iterable[str], **kwargs): + self.keys = list(keys) + super().__init__(**kwargs) def _apply(self, data, **kwargs): - return data.get(self.key) + while self.keys: + key = self.keys.pop(0) + try: + data = data.get(key) + except AttributeError: + data = data[0].get(key) + return data class RDF(Extractor): diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index 2a3c28e..6d6e7b4 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -38,7 +38,8 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: def _get_json_data(self, source: Source) -> dict: if isfile(source): - return json.load(source) + with open(source, "r") as f: + return json.load(f) elif type(source) == Response: return source.json() elif type(source) == bytes: diff --git a/tests/json/data/Macbeth.json b/tests/json/data/Macbeth.json new file mode 100644 index 0000000..b290e09 --- /dev/null +++ b/tests/json/data/Macbeth.json @@ -0,0 +1,61 @@ +{"TITLE":"ACT I", +"SCENE":[ + { + "TITLE":"SCENE I. A desert place.", + "STAGEDIR":[ + "Thunder and lightning. Enter three Witches", + "Exeunt" + ], + "SPEECH":[ + { + "SPEAKER":"First Witch", + "LINE":[ + "When shall we three meet again", + "In thunder, lightning, or in rain?" + ] + }, + { + "SPEAKER":"Second Witch", + "LINE":[ + "When the hurlyburly's done,", + "When the battle's lost and won." + ] + }, + { + "SPEAKER":"Third Witch", + "LINE":"That will be ere the set of sun." + }, + { + "SPEAKER":"First Witch", + "LINE":"Where the place?" + }, + { + "SPEAKER":"Second Witch", + "LINE":"Upon the heath." + }, + { + "SPEAKER":"Third Witch", + "LINE":"There to meet with Macbeth." + }, + { + "SPEAKER":"First Witch", + "LINE":"I come, Graymalkin!" + }, + { + "SPEAKER":"Second Witch", + "LINE":"Paddock calls." 
+ }, + { + "SPEAKER":"Third Witch", + "LINE":"Anon." + }, + { + "SPEAKER":"ALL", + "LINE":[ + "Fair is foul, and foul is fair:", + "Hover through the fog and filthy air." + ] + } + ] + }] +} \ No newline at end of file diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py new file mode 100644 index 0000000..2748269 --- /dev/null +++ b/tests/json/json_reader.py @@ -0,0 +1,33 @@ +from glob import glob +import os + +from ianalyzer_readers.extract import JSON +from ianalyzer_readers.readers.core import Field +from ianalyzer_readers.readers.json import JSONReader + + +class JSONTestReader(JSONReader): + """ + Example JSON reader for testing, using JSON data from https://github.com/tux255/analyzing-shakespeare + """ + + data_directory = os.path.join(os.path.dirname(__file__), "data") + + def sources(self, **kwargs): + for filename in glob(f"{self.data_directory}/*.json"): + full_path = os.path.join(self.data_directory, filename) + yield full_path + + act = Field("act", JSON("TITLE")) + scene = Field("scene", JSON("SCENE", "TITLE")) + character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER")) + lines = Field( + "lines", JSON("SCENE", "SPEECH", "LINE"), transform=lambda x: " ".join(x) + ) + + fields = [ + act, + scene, + character, + lines, + ] diff --git a/tests/test_json_reader.py b/tests/test_json_reader.py new file mode 100644 index 0000000..1dd97a9 --- /dev/null +++ b/tests/test_json_reader.py @@ -0,0 +1,22 @@ +from tests.json.json_reader import JSONTestReader + +expected = [ + { + "act": "ACT I", + "scene": "SCENE I. 
A desert place.", + "character": "First Witch", + "lines": [ + "When shall we three meet again", + "In thunder, lightning, or in rain?", + ], + } +] + + +def test_json_read_file(): + reader = JSONTestReader() + docs = reader.documents() + for doc, target in zip(docs, expected): + assert len(target.keys()) == len(doc.keys()) + for key in target.keys(): + assert doc.get(key) == target.get(key) From ed5813df5a014b8e450bca2f57773b307a3f0121 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 14 Nov 2024 11:41:23 +0100 Subject: [PATCH 04/13] enable hierarchical parsing of JSON sources --- ianalyzer_readers/extract.py | 9 ++---- ianalyzer_readers/readers/json.py | 48 ++++++++++++++++++++++++------- tests/json/json_reader.py | 1 + 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index ba3f860..b827605 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -506,12 +506,9 @@ def __init__(self, *keys: Iterable[str], **kwargs): def _apply(self, data, **kwargs): while self.keys: key = self.keys.pop(0) - try: - data = data.get(key) - except AttributeError: - data = data[0].get(key) - return data - + output = data.get(key) + self._apply(output) + return output class RDF(Extractor): ''' An extractor to extract data from RDF triples diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index 6d6e7b4..8bf681d 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -8,29 +8,36 @@ import ianalyzer_readers.extract as extract class JSONReader(Reader): - ''' + """ A base class for Readers of JSON encoded data. - ''' + + Attributes: + document_path (Iterable[str]): a keyword or list of keywords by which a list of documents can be extracted + """ + + document_path = [] def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: - ''' - Given a Python dictionary, returns an iterable of extracted documents. 
- + """ + Given a Python dictionary, returns an iterable of extracted documents. + Parameters: - source: a - ''' + source: the input data + + Returns: + list of documents + """ if type(source) == tuple: metadata = source[1] json_data = self._get_json_data(source[0]) else: metadata = None json_data = self._get_json_data(source) + data = self._parse_json_tree(json_data) self._reject_extractors(extract.XML, extract.CSV, extract.RDF) field_dict = { - field.name: field.extractor.apply( - json_data, metadata=metadata, *nargs, **kwargs - ) + field.name: field.extractor.apply(data, metadata=metadata, *nargs, **kwargs) for field in self.fields } @@ -46,3 +53,24 @@ def _get_json_data(self, source: Source) -> dict: return json.loads(source) else: raise Exception("Unexpected source type for JSON Reader") + + def _parse_json_tree(self, data: dict, output: dict = {}) -> Iterable[dict]: + """Step through the dict recursively, collecting all data + Documents can be members of a list + """ + while len(self.document_path): + document_key = self.document_path.pop(0) + data_keys = data.keys() + for data_key in data_keys: + if data_key != document_key: + output[data_key] == data[data_key] + try: + path_content = data[document_key] + except KeyError: + raise Exception("path to identify documents is invalid") + if type(path_content) == list: + new_data = path_content.pop(0) + self._parse_json_tree(new_data, output) + else: + output[document_key] = path_content + return output diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py index 2748269..d706852 100644 --- a/tests/json/json_reader.py +++ b/tests/json/json_reader.py @@ -12,6 +12,7 @@ class JSONTestReader(JSONReader): """ data_directory = os.path.join(os.path.dirname(__file__), "data") + document_path = ["SCENE", "SPEECH"] def sources(self, **kwargs): for filename in glob(f"{self.data_directory}/*.json"): From 13ed87a20236d4bd8a23227f8964f5a2b02f4fe4 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Wed, 20 Nov 2024 
14:50:33 +0100 Subject: [PATCH 05/13] use pandas json_normalize for flattening nested data --- ianalyzer_readers/extract.py | 15 ++++----- ianalyzer_readers/readers/json.py | 56 ++++++++++++++----------------- pyproject.toml | 1 + requirements.txt | 12 ++++++- tests/json/json_reader.py | 27 ++++++++------- tests/test_json_reader.py | 37 +++++++++++++------- 6 files changed, 82 insertions(+), 66 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index b827605..6301bf8 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -499,16 +499,13 @@ class JSON(Extractor): Parameters: key: the key with which to retrieve a field from the source ''' - def __init__(self, *keys: Iterable[str], **kwargs): - self.keys = list(keys) - super().__init__(**kwargs) + def __init__(self, key, *args, **kwargs): + self.key = key + super().__init__(*args, **kwargs) + + def _apply(self, data, *args, **kwargs): + return data.get(self.key) - def _apply(self, data, **kwargs): - while self.keys: - key = self.keys.pop(0) - output = data.get(key) - self._apply(output) - return output class RDF(Extractor): ''' An extractor to extract data from RDF triples diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index 8bf681d..2436a31 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -1,7 +1,8 @@ import json from os.path import isfile -from typing import Iterable +from typing import Iterable, Optional, Union +from pandas import json_normalize from requests import Response from .core import Reader, Document, Source @@ -11,11 +12,18 @@ class JSONReader(Reader): """ A base class for Readers of JSON encoded data. + The reader can either be used on a collection of JSON files, in which each file represents a document, + or for a JSON file containing lists of documents. 
+ + If the attributes `record_path` and `meta` are passed, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data + Attributes: - document_path (Iterable[str]): a keyword or list of keywords by which a list of documents can be extracted + record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; do not define if the corpus is structured as one file per document + meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located """ - document_path = [] + record_path: Optional[list[str]] = None + meta: Optional[list[Union[str, list[str]]]] = None def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: """ @@ -33,15 +41,24 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: else: metadata = None json_data = self._get_json_data(source) - data = self._parse_json_tree(json_data) + + if self.record_path and self.meta: + documents = json_normalize(json_data, self.record_path, self.meta).to_dict( + 'records' + ) + else: + documents = list(json_data) self._reject_extractors(extract.XML, extract.CSV, extract.RDF) - field_dict = { - field.name: field.extractor.apply(data, metadata=metadata, *nargs, **kwargs) - for field in self.fields - } + for doc in documents: + field_dict = { + field.name: field.extractor.apply( + doc, metadata=metadata, *nargs, **kwargs + ) + for field in self.fields + } - yield field_dict + yield field_dict def _get_json_data(self, source: Source) -> dict: if isfile(source): @@ -53,24 +70,3 @@ def _get_json_data(self, source: Source) -> dict: return json.loads(source) else: raise Exception("Unexpected source type for JSON Reader") - - def _parse_json_tree(self, data: dict, output: dict = {}) -> Iterable[dict]: - """Step through the dict recursively, collecting all data - Documents can be members of 
a list - """ - while len(self.document_path): - document_key = self.document_path.pop(0) - data_keys = data.keys() - for data_key in data_keys: - if data_key != document_key: - output[data_key] == data[data_key] - try: - path_content = data[document_key] - except KeyError: - raise Exception("path to identify documents is invalid") - if type(path_content) == list: - new_data = path_content.pop(0) - self._parse_json_tree(new_data, output) - else: - output[document_key] = path_content - return output diff --git a/pyproject.toml b/pyproject.toml index 242da96..0ef79ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "beautifulsoup4", "lxml", "openpyxl", + "pandas", "rdflib", ] diff --git a/requirements.txt b/requirements.txt index 4ac765f..8d644bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,12 +56,16 @@ mkdocstrings==0.24.1 # via mkdocstrings-python mkdocstrings-python==1.9.0 # via ianalyzer_readers (setup.py) +numpy==2.1.3 + # via pandas openpyxl==3.1.2 # via ianalyzer_readers (setup.py) packaging==24.0 # via # mkdocs # pytest +pandas==2.2.3 + # via ianalyzer_readers (setup.py) pathspec==0.12.1 # via mkdocs platformdirs==4.2.0 @@ -77,7 +81,11 @@ pyparsing==3.1.2 pytest==8.1.1 # via ianalyzer_readers (setup.py) python-dateutil==2.9.0.post0 - # via ghp-import + # via + # ghp-import + # pandas +pytz==2024.2 + # via pandas pyyaml==6.0.1 # via # mkdocs @@ -95,5 +103,7 @@ soupsieve==2.5 # via beautifulsoup4 tomli==2.0.1 # via pytest +tzdata==2024.2 + # via pandas watchdog==4.0.0 # via mkdocs diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py index d706852..b4f131e 100644 --- a/tests/json/json_reader.py +++ b/tests/json/json_reader.py @@ -6,13 +6,20 @@ from ianalyzer_readers.readers.json import JSONReader +def merge_lines(lines: list | str) -> str: + if isinstance(lines, list): + return "\n".join(lines) + return lines + + class JSONTestReader(JSONReader): """ Example JSON reader for testing, using JSON data 
from https://github.com/tux255/analyzing-shakespeare """ data_directory = os.path.join(os.path.dirname(__file__), "data") - document_path = ["SCENE", "SPEECH"] + record_path = ["SCENE", "SPEECH"] + meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]] def sources(self, **kwargs): for filename in glob(f"{self.data_directory}/*.json"): @@ -20,15 +27,9 @@ def sources(self, **kwargs): yield full_path act = Field("act", JSON("TITLE")) - scene = Field("scene", JSON("SCENE", "TITLE")) - character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER")) - lines = Field( - "lines", JSON("SCENE", "SPEECH", "LINE"), transform=lambda x: " ".join(x) - ) - - fields = [ - act, - scene, - character, - lines, - ] + scene = Field("scene", JSON("SPEECH.TITLE")) + character = Field("character", JSON("SPEAKER")) + lines = Field("lines", JSON("LINE", transform=merge_lines)) + stage_dir = Field("stage_direction", JSON("SPEECH.STAGEDIR", transform=merge_lines)) + + fields = [act, scene, character, lines, stage_dir] diff --git a/tests/test_json_reader.py b/tests/test_json_reader.py index 1dd97a9..8e84583 100644 --- a/tests/test_json_reader.py +++ b/tests/test_json_reader.py @@ -2,21 +2,32 @@ expected = [ { - "act": "ACT I", - "scene": "SCENE I. A desert place.", - "character": "First Witch", - "lines": [ - "When shall we three meet again", - "In thunder, lightning, or in rain?", - ], - } + 'act': 'ACT I', + 'scene': 'SCENE I. A desert place.', + 'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt', + 'character': 'First Witch', + 'lines': 'When shall we three meet again\nIn thunder, lightning, or in rain?', + }, + *[{}] * 8, + { + 'act': 'ACT I', + 'scene': 'SCENE I. A desert place.', + 'stage_direction': 'Thunder and lightning. 
Enter three Witches\nExeunt', + 'character': 'ALL', + 'lines': "Fair is foul, and foul is fair:\nHover through the fog and filthy air.", + }, ] def test_json_read_file(): reader = JSONTestReader() - docs = reader.documents() - for doc, target in zip(docs, expected): - assert len(target.keys()) == len(doc.keys()) - for key in target.keys(): - assert doc.get(key) == target.get(key) + docs = list(reader.documents()) + assert len(docs) == len(expected) + _assert_matches(expected[0], docs[0]) + _assert_matches(expected[-1], docs[-1]) + + +def _assert_matches(target: dict, doc: dict): + assert len(target.keys()) == len(doc.keys()) + for key in target.keys(): + assert doc.get(key) == target.get(key) From 2fde2a863f985799ae4655569a7efca164ead92a Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 21 Nov 2024 12:13:35 +0100 Subject: [PATCH 06/13] feat: also allow deep nested single document JSON files --- ianalyzer_readers/extract.py | 22 +++++++++++------ ianalyzer_readers/readers/json.py | 3 ++- tests/json/json_reader.py | 41 ++++++++++++++++++++++++++++--- tests/test_json_reader.py | 16 +++++++++--- 4 files changed, 66 insertions(+), 16 deletions(-) diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index 7d32e86..f05cad9 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -493,18 +493,24 @@ def _apply(self, metadata, *nargs, **kwargs): class JSON(Extractor): - ''' An extractor to extract data from JSON + '''An extractor to extract data from JSON This extractor assumes that each source is a flat dictionary - + Parameters: - key: the key with which to retrieve a field from the source + keys (Iterable[str]): the keys with which to retrieve a field value from the source ''' - def __init__(self, key, *args, **kwargs): - self.key = key - super().__init__(*args, **kwargs) - def _apply(self, data, *args, **kwargs): - return data.get(self.key) + def __init__(self, *keys, **kwargs): + self.keys = list(keys) + 
super().__init__(**kwargs) + + def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs) -> str: + key = self.keys[key_index] + data = data.get(key) + if len(self.keys) > key_index + 1: + key_index += 1 + return self._apply(data, key_index) + return data class RDF(Extractor): diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index 2436a31..ac6235e 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -47,7 +47,8 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: 'records' ) else: - documents = list(json_data) + documents = [json_data] + self._reject_extractors(extract.XML, extract.CSV, extract.RDF) for doc in documents: diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py index b4f131e..b5fa52d 100644 --- a/tests/json/json_reader.py +++ b/tests/json/json_reader.py @@ -1,4 +1,5 @@ from glob import glob +import json import os from ianalyzer_readers.extract import JSON @@ -12,20 +13,52 @@ def merge_lines(lines: list | str) -> str: return lines -class JSONTestReader(JSONReader): +class JSONDocumentReader(JSONReader): """ - Example JSON reader for testing, using JSON data from https://github.com/tux255/analyzing-shakespeare + Example reader that would operate on corpora with one json file per document """ data_directory = os.path.join(os.path.dirname(__file__), "data") - record_path = ["SCENE", "SPEECH"] - meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]] + + def sources(self, **kwargs): + for i in range(1): + data = json.dumps( + { + "TITLE": "ACT I", + "SCENE": { + "TITLE": "SCENE I. A desert place.", + "STAGEDIR": [ + "Thunder and lightning. 
Enter three Witches", + "Exeunt", + ], + "SPEECH": { + "SPEAKER": "First Witch", + }, + }, + } + ) + yield data.encode('utf-8') + + act = Field("act", JSON("TITLE")) + character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER")) + scene = Field("scene", JSON("SCENE", "TITLE")) + + fields = [act, character, scene] + + +class JSONMultipleDocumentReader(JSONDocumentReader): + """ + Example JSON reader for testing parsing arrays in JSON, using JSON data from https://github.com/tux255/analyzing-shakespeare + """ def sources(self, **kwargs): for filename in glob(f"{self.data_directory}/*.json"): full_path = os.path.join(self.data_directory, filename) yield full_path + record_path = ["SCENE", "SPEECH"] + meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]] + act = Field("act", JSON("TITLE")) scene = Field("scene", JSON("SPEECH.TITLE")) character = Field("character", JSON("SPEAKER")) diff --git a/tests/test_json_reader.py b/tests/test_json_reader.py index 8e84583..d8111cb 100644 --- a/tests/test_json_reader.py +++ b/tests/test_json_reader.py @@ -1,4 +1,4 @@ -from tests.json.json_reader import JSONTestReader +from tests.json.json_reader import JSONDocumentReader, JSONMultipleDocumentReader expected = [ { @@ -19,8 +19,18 @@ ] -def test_json_read_file(): - reader = JSONTestReader() +def test_json_parse_single_document(): + reader = JSONDocumentReader() + docs = list(reader.documents()) + assert len(docs) == 1 + assert docs[0].get('act') == 'ACT I' + assert docs[0].get('character') == 'First Witch' + assert docs[0].get('scene') == 'SCENE I. A desert place.' 
+ + +def test_json_parse_multiple_documents(): + '''test that JSON reader can parse multiple documents from an array in a single file''' + reader = JSONMultipleDocumentReader() docs = list(reader.documents()) assert len(docs) == len(expected) _assert_matches(expected[0], docs[0]) From 5baebc0728f610e5f2427fc559f96381fb6b5416 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 21 Nov 2024 12:21:10 +0100 Subject: [PATCH 07/13] update requirements --- pyproject.toml | 1 + requirements.txt | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7e189fb..26cf5a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "lxml", "openpyxl", "pandas", + "requests", "rdflib", ] diff --git a/requirements.txt b/requirements.txt index 8d644bc..575d362 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,10 @@ # beautifulsoup4==4.12.3 # via ianalyzer_readers (setup.py) +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests click==8.1.7 # via # mkdocs @@ -20,6 +24,8 @@ ghp-import==2.1.0 # via mkdocs griffe==0.42.0 # via mkdocstrings-python +idna==3.10 + # via requests iniconfig==2.0.0 # via pytest isodate==0.6.1 @@ -95,6 +101,8 @@ pyyaml-env-tag==0.1 # via mkdocs rdflib==7.0.0 # via ianalyzer_readers (setup.py) +requests==2.32.3 + # via ianalyzer_readers (setup.py) six==1.16.0 # via # isodate @@ -105,5 +113,7 @@ tomli==2.0.1 # via pytest tzdata==2024.2 # via pandas +urllib3==2.2.3 + # via requests watchdog==4.0.0 # via mkdocs From 4fdc3df8f62f6f874353a7805d599f4d723c10ba Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 21 Nov 2024 12:22:56 +0100 Subject: [PATCH 08/13] fix typing issue --- tests/json/json_reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py index b5fa52d..cfb2784 100644 --- a/tests/json/json_reader.py +++ b/tests/json/json_reader.py @@ -1,13 +1,14 @@ from glob import 
glob import json import os +from typing import Union from ianalyzer_readers.extract import JSON from ianalyzer_readers.readers.core import Field from ianalyzer_readers.readers.json import JSONReader -def merge_lines(lines: list | str) -> str: +def merge_lines(lines: Union[list, str]) -> str: if isinstance(lines, list): return "\n".join(lines) return lines From 5b2b42a0395d708ef1188c2da60c1d2d3a90842f Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 21 Nov 2024 12:26:01 +0100 Subject: [PATCH 09/13] fix typing issue Python 3.8 --- ianalyzer_readers/readers/json.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index ac6235e..cf10180 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -1,6 +1,6 @@ import json from os.path import isfile -from typing import Iterable, Optional, Union +from typing import Iterable, List, Optional, Union from pandas import json_normalize from requests import Response @@ -22,8 +22,8 @@ class JSONReader(Reader): meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located """ - record_path: Optional[list[str]] = None - meta: Optional[list[Union[str, list[str]]]] = None + record_path: Optional[List[str]] = None + meta: Optional[List[Union[str, List[str]]]] = None def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: """ From 48401a083294e073800bff8f2dc79bd4f28fa182 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Dec 2024 11:49:47 +0100 Subject: [PATCH 10/13] feat: improve documentation --- docs/api.md | 6 +++ ianalyzer_readers/extract.py | 6 ++- ianalyzer_readers/readers/json.py | 84 ++++++++++++++++++++++++++++--- 3 files changed, 88 insertions(+), 8 deletions(-) diff --git a/docs/api.md b/docs/api.md index 6833b96..5859d25 100644 --- a/docs/api.md +++ b/docs/api.md @@ -36,6 +36,12 @@ __Module:__ 
`ianalyzer_readers.readers.rdf` ::: ianalyzer_readers.readers.rdf +## JSON reader + +__Module:__ `ianalyzer_readers.readers.json` + +::: ianalyzer_readers.readers.json + ## Extractors __Module:__ `ianalyzer_readers.extract` diff --git a/ianalyzer_readers/extract.py b/ianalyzer_readers/extract.py index f05cad9..82d29a6 100644 --- a/ianalyzer_readers/extract.py +++ b/ianalyzer_readers/extract.py @@ -493,8 +493,10 @@ def _apply(self, metadata, *nargs, **kwargs): class JSON(Extractor): - '''An extractor to extract data from JSON - This extractor assumes that each source is a flat dictionary + ''' + An extractor to extract data from JSON. + This extractor assumes that each source is a dictionary without nested lists. + When working with nested lists, use JSONReader to unnest. Parameters: keys (Iterable[str]): the keys with which to retrieve a field value from the source diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index cf10180..8e161f0 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -1,3 +1,10 @@ +''' +This module defines the JSONReader. + +It can parse documents nested in one file, for which it uses the pandas library, +or multiple files with one document each, which use the generic Python json parser. +''' + import json from os.path import isfile from typing import Iterable, List, Optional, Union @@ -9,21 +16,86 @@ import ianalyzer_readers.extract as extract class JSONReader(Reader): - """ + ''' A base class for Readers of JSON encoded data. - The reader can either be used on a collection of JSON files, in which each file represents a document, + The reader can either be used on a collection of JSON files (`single_document=True`), in which each file represents a document, or for a JSON file containing lists of documents.
- If the attributes `record_path` and `meta` are passed, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data + If the attributes `record_path` and `meta` are set, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data Attributes: - record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; do not define if the corpus is structured as one file per document - meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located + single_document: indicates whether the data is organized such that a file represents a single document + record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; irrelevant if `single_document = True` + meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located; irrelevant if `single_document = True` """ + Examples: + ##### Multiple documents in one file: + ```python + example_data = { + 'path': { + 'sketch': 'Hungarian Phrasebook', + 'episode': 25, + 'to': { + 'records': + [ + {'speech': 'I will not buy this record. It is scratched.', 'character': 'tourist'}, + {'speech': "No sir. 
This is a tobacconist's.", 'character': 'tobacconist'} + ] + } + } + } + + class MyJSONReader(JSONReader): + record_path = ['path', 'to', 'records'] + meta = [['path', 'sketch'], ['path', 'episode']] + + speech = Field('speech', JSON('speech')) + character = Field('character', JSON('character')) + sketch = Field('sketch', JSON('path.sketch')) # field name results from paths in `meta` array, separated by a dot + episode = Field('episode', JSON('path.episode')) + ``` + + ##### Single document per file: + ```python + example_data = { + 'sketch': 'Hungarian Phrasebook', + 'episode': 25, + 'scene': { + 'character': 'tourist', + 'speech': 'I will not buy this record. It is scratched.' + } + } + + class MyJSONReader(JSONReader): + single_document = True + + speech = Field('speech', JSON('scene', 'speech')) + character = Field('character', JSON('scene', 'character')) + sketch = Field('sketch', JSON('sketch')) + episode = Field('episode', JSON('episode')) + ``` + + ''' + + single_document: bool = False + ''' + set to `True` if the data is structured such that one document is encoded in one .json file + in that case, the reader assumes that there are no lists in such a file + ''' + record_path: Optional[List[str]] = None + ''' + a keyword or list of keywords by which a list of documents can be extracted from a large JSON file. + Only relevant if `single_document=False`. + ''' + meta: Optional[List[Union[str, List[str]]]] = None + ''' + a list of keywords, or list of lists of keywords, by which metadata for each document can be located, + if it is in a different path than `record_path`. Only relevant if `single_document=False`. 
+ ''' def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: """ @@ -42,7 +114,7 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: metadata = None json_data = self._get_json_data(source) - if self.record_path and self.meta: + if not self.single_document: documents = json_normalize(json_data, self.record_path, self.meta).to_dict( 'records' ) From 7b296cb439dfa8ddd4391e49d613848aa8c063ec Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Dec 2024 11:52:06 +0100 Subject: [PATCH 11/13] reference JSON capacity in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7de83c6..e9323f7 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Python package](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml/badge.svg)](https://github.com/UUDigitalHumanitieslab/ianalyzer-readers/actions/workflows/python-package.yml) [![Documentation Status](https://readthedocs.org/projects/ianalyzer-readers/badge/?version=latest)](https://ianalyzer-readers.readthedocs.io/en/latest/?badge=latest) -`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV, XLSX or TTL files. +`ianalyzer-readers` is a python module to extract data from XML, HTML, CSV, JSON, XLSX or RDF (Linked Data) files. This module was originally created for [I-analyzer](https://github.com/UUDigitalHumanitieslab/I-analyzer), a web application that extracts data from a variety of datasets, indexes them and presents a search interface. To do this, we wanted a way to extract data from source files without having to write a new script "from scratch" for each dataset, and an API that would work the same regardless of the source file type. 
@@ -25,7 +25,7 @@ Our primary use for this package is to pre-process data for I-analyzer, but you Using this package makes sense if you want to extract data in the shape that it is designed for (i.e., a list of flat dictionaries). -What we find especially useful is that all subclasses of `Reader` have the same interface - regardless of whether they are processing CSV, XML, HTML, or XLSX data. That common interface is crucial in an application that needs to process corpora from different source types, like I-analyzer. +What we find especially useful is that all subclasses of `Reader` have the same interface - regardless of whether they are processing CSV, JSON, XML, HTML, RDF or XLSX data. That common interface is crucial in an application that needs to process corpora from different source types, like I-analyzer. ## Usage From f963d3eab34c6ecab795417ebe16de2f6a5efa8a Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Dec 2024 12:58:58 +0100 Subject: [PATCH 12/13] fix unit test --- ianalyzer_readers/readers/json.py | 6 +++--- tests/json/json_reader.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ianalyzer_readers/readers/json.py b/ianalyzer_readers/readers/json.py index 8e161f0..cd03fd6 100644 --- a/ianalyzer_readers/readers/json.py +++ b/ianalyzer_readers/readers/json.py @@ -115,9 +115,9 @@ def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]: json_data = self._get_json_data(source) if not self.single_document: - documents = json_normalize(json_data, self.record_path, self.meta).to_dict( - 'records' - ) + documents = json_normalize( + json_data, record_path=self.record_path, meta=self.meta + ).to_dict('records') else: documents = [json_data] diff --git a/tests/json/json_reader.py b/tests/json/json_reader.py index cfb2784..3ad67c5 100644 --- a/tests/json/json_reader.py +++ b/tests/json/json_reader.py @@ -20,6 +20,7 @@ class JSONDocumentReader(JSONReader): """ data_directory = 
os.path.join(os.path.dirname(__file__), "data") + single_document = True def sources(self, **kwargs): for i in range(1): @@ -47,23 +48,22 @@ def sources(self, **kwargs): fields = [act, character, scene] -class JSONMultipleDocumentReader(JSONDocumentReader): +class JSONMultipleDocumentReader(JSONReader): """ Example JSON reader for testing parsing arrays in JSON, using JSON data from https://github.com/tux255/analyzing-shakespeare """ + data_directory = os.path.join(os.path.dirname(__file__), "data") + record_path = ["SCENE", "SPEECH"] + meta = ["TITLE", ["SCENE", "TITLE"], ["SCENE", "STAGEDIR"]] def sources(self, **kwargs): for filename in glob(f"{self.data_directory}/*.json"): - full_path = os.path.join(self.data_directory, filename) - yield full_path - - record_path = ["SCENE", "SPEECH"] - meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]] + yield filename act = Field("act", JSON("TITLE")) - scene = Field("scene", JSON("SPEECH.TITLE")) + scene = Field("scene", JSON("SCENE.TITLE")) character = Field("character", JSON("SPEAKER")) lines = Field("lines", JSON("LINE", transform=merge_lines)) - stage_dir = Field("stage_direction", JSON("SPEECH.STAGEDIR", transform=merge_lines)) + stage_dir = Field("stage_direction", JSON("SCENE.STAGEDIR", transform=merge_lines)) fields = [act, scene, character, lines, stage_dir] From d792f00457fb24b1e4d0b0c80155867141b01536 Mon Sep 17 00:00:00 2001 From: BeritJanssen Date: Thu, 19 Dec 2024 13:12:34 +0100 Subject: [PATCH 13/13] drop Python 3.8 support --- .github/workflows/python-package.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 55ab63e..8aed826 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11"] steps: - uses: 
actions/checkout@v3 diff --git a/README.md b/README.md index e9323f7..628ae82 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The basic usage is that you will use the utilities in this package to create a " ## Prerequisites -Requires Python 3.8 or later. +Requires Python 3.9 or later. ## Contents