Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JSON reader & extractor #27

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
22 changes: 22 additions & 0 deletions ianalyzer_readers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ def format(self, value):
if value and value not in self.convert_to_none:
return value


class ExternalFile(Extractor):
'''
Free for all external file extractor that provides a stream to `stream_handler`
Expand All @@ -491,6 +492,27 @@ def _apply(self, metadata, *nargs, **kwargs):
return self.stream_handler(open(metadata['associated_file'], 'r'))


class JSON(Extractor):
    '''An extractor to extract data from JSON.

    The extractor walks into nested dictionaries by applying its keys
    in order; with a single key it reads from a flat dictionary.

    Parameters:
        keys (Iterable[str]): the keys with which to retrieve a field value from the source
    '''

    def __init__(self, *keys, **kwargs):
        # Keys are applied left-to-right to drill into nested dicts.
        self.keys = list(keys)
        super().__init__(**kwargs)

    def _apply(self, data: Union[str, dict], key_index: int = 0, **kwargs):
        '''Return the value reached by applying ``self.keys[key_index:]`` to ``data``.

        Returns ``None`` when an intermediate key is missing or the
        intermediate value is not a dict. (The recursive original
        crashed with ``AttributeError`` in that case, because it called
        ``.get`` on the ``None`` returned by the previous lookup.)
        '''
        for key in self.keys[key_index:]:
            if not isinstance(data, dict):
                return None
            data = data.get(key)
        return data


class RDF(Extractor):
"""An extractor to extract data from RDF triples

Expand Down
13 changes: 9 additions & 4 deletions ianalyzer_readers/readers/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,23 @@
import logging
import csv

from requests import Response

logging.basicConfig(level=logging.WARNING)
logging.getLogger('ianalyzer-readers').setLevel(logging.DEBUG)
# setLevel() returns None, so chaining it after getLogger() would bind
# `logger` to None. Bind the logger first, then configure its level.
logger = logging.getLogger('ianalyzer-readers')
logger.setLevel(logging.DEBUG)

Source = Union[str, Tuple[Union[str, bytes], Dict], bytes]
SourceType = Union[str, Response, bytes]
Source = Union[SourceType, Tuple[SourceType, Dict]]
'''
Type definition for the source input to some Reader methods.

Sources are either:

- a string with the path to a filename
- a tuple containing a path to a filename, and a dictionary with metadata
- binary data with the file contents. This is not supported on all Reader subclasses.
- binary data with the file contents. This is not supported on all Reader subclasses
- a requests.Response
- a tuple of one of the above, and a dictionary with metadata

'''

Document = Dict[str, Any]
Expand Down
73 changes: 73 additions & 0 deletions ianalyzer_readers/readers/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import json
from os.path import isfile
from typing import Iterable, List, Optional, Union

from pandas import json_normalize
from requests import Response

from .core import Reader, Document, Source
import ianalyzer_readers.extract as extract

class JSONReader(Reader):
    """
    A base class for Readers of JSON encoded data.

    The reader can either be used on a collection of JSON files, in which each file represents a document,
    or for a JSON file containing lists of documents.

    If the attributes `record_path` and `meta` are passed, they are used as arguments to [pandas.json_normalize](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.json_normalize.html) to unnest the JSON data

    Attributes:
        record_path: a keyword or list of keywords by which a list of documents can be extracted from a large JSON file; do not define if the corpus is structured as one file per document
        meta: a list of keywords, or list of lists of keywords, by which metadata for each document can be located
    """

    record_path: Optional[List[str]] = None
    meta: Optional[List[Union[str, List[str]]]] = None

    def source2dicts(self, source: Source, *nargs, **kwargs) -> Iterable[Document]:
        """
        Given a Python dictionary, returns an iterable of extracted documents.

        Parameters:
            source: the input data

        Returns:
            list of documents
        """
        # Fail fast on incompatible extractors before doing any parsing work.
        self._reject_extractors(extract.XML, extract.CSV, extract.RDF)

        if isinstance(source, tuple):
            source_data, metadata = source
        else:
            source_data, metadata = source, None
        json_data = self._get_json_data(source_data)

        if self.record_path and self.meta:
            # Unnest a single large JSON file into one record per document.
            documents = json_normalize(
                json_data, record_path=self.record_path, meta=self.meta
            ).to_dict('records')
        else:
            # One source == one document.
            documents = [json_data]

        for doc in documents:
            yield {
                field.name: field.extractor.apply(
                    doc, metadata=metadata, *nargs, **kwargs
                )
                for field in self.fields
            }

    def _get_json_data(self, source: Source) -> dict:
        """Load JSON content from a file path, a requests.Response, or raw bytes.

        Raises:
            FileNotFoundError: if a string path does not point to an existing file
            TypeError: if the source is of an unsupported type
        """
        # Check Response/bytes before treating the source as a path:
        # os.path.isfile raises TypeError on a Response object.
        if isinstance(source, Response):
            return source.json()
        if isinstance(source, bytes):
            return json.loads(source)
        if isinstance(source, str):
            if isfile(source):
                with open(source, "r") as f:
                    return json.load(f)
            raise FileNotFoundError(f"No file found at {source}")
        raise TypeError("Unexpected source type for JSON Reader")
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ dependencies = [
"beautifulsoup4",
"lxml",
"openpyxl",
"pandas",
"requests",
"rdflib",
]

Expand Down
22 changes: 21 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#
beautifulsoup4==4.12.3
# via ianalyzer_readers (setup.py)
certifi==2024.8.30
# via requests
charset-normalizer==3.4.0
# via requests
click==8.1.7
# via
# mkdocs
Expand All @@ -20,6 +24,8 @@ ghp-import==2.1.0
# via mkdocs
griffe==0.42.0
# via mkdocstrings-python
idna==3.10
# via requests
iniconfig==2.0.0
# via pytest
isodate==0.6.1
Expand Down Expand Up @@ -56,12 +62,16 @@ mkdocstrings==0.24.1
# via mkdocstrings-python
mkdocstrings-python==1.9.0
# via ianalyzer_readers (setup.py)
numpy==2.1.3
# via pandas
openpyxl==3.1.2
# via ianalyzer_readers (setup.py)
packaging==24.0
# via
# mkdocs
# pytest
pandas==2.2.3
# via ianalyzer_readers (setup.py)
pathspec==0.12.1
# via mkdocs
platformdirs==4.2.0
Expand All @@ -77,7 +87,11 @@ pyparsing==3.1.2
pytest==8.1.1
# via ianalyzer_readers (setup.py)
python-dateutil==2.9.0.post0
# via ghp-import
# via
# ghp-import
# pandas
pytz==2024.2
# via pandas
pyyaml==6.0.1
# via
# mkdocs
Expand All @@ -87,6 +101,8 @@ pyyaml-env-tag==0.1
# via mkdocs
rdflib==7.0.0
# via ianalyzer_readers (setup.py)
requests==2.32.3
# via ianalyzer_readers (setup.py)
six==1.16.0
# via
# isodate
Expand All @@ -95,5 +111,9 @@ soupsieve==2.5
# via beautifulsoup4
tomli==2.0.1
# via pytest
tzdata==2024.2
# via pandas
urllib3==2.2.3
# via requests
watchdog==4.0.0
# via mkdocs
61 changes: 61 additions & 0 deletions tests/json/data/Macbeth.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{"TITLE":"ACT I",
"SCENE":[
{
"TITLE":"SCENE I. A desert place.",
"STAGEDIR":[
"Thunder and lightning. Enter three Witches",
"Exeunt"
],
"SPEECH":[
{
"SPEAKER":"First Witch",
"LINE":[
"When shall we three meet again",
"In thunder, lightning, or in rain?"
]
},
{
"SPEAKER":"Second Witch",
"LINE":[
"When the hurlyburly's done,",
"When the battle's lost and won."
]
},
{
"SPEAKER":"Third Witch",
"LINE":"That will be ere the set of sun."
},
{
"SPEAKER":"First Witch",
"LINE":"Where the place?"
},
{
"SPEAKER":"Second Witch",
"LINE":"Upon the heath."
},
{
"SPEAKER":"Third Witch",
"LINE":"There to meet with Macbeth."
},
{
"SPEAKER":"First Witch",
"LINE":"I come, Graymalkin!"
},
{
"SPEAKER":"Second Witch",
"LINE":"Paddock calls."
},
{
"SPEAKER":"Third Witch",
"LINE":"Anon."
},
{
"SPEAKER":"ALL",
"LINE":[
"Fair is foul, and foul is fair:",
"Hover through the fog and filthy air."
]
}
]
}]
}
69 changes: 69 additions & 0 deletions tests/json/json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from glob import glob
import json
import os
from typing import Union

from ianalyzer_readers.extract import JSON
from ianalyzer_readers.readers.core import Field
from ianalyzer_readers.readers.json import JSONReader


def merge_lines(lines: Union[list, str]) -> str:
    """Join a list of lines into one newline-separated string; pass strings through unchanged."""
    if not isinstance(lines, list):
        return lines
    return "\n".join(lines)


class JSONDocumentReader(JSONReader):
    """
    Example reader that would operate on corpora with one json file per document
    """

    data_directory = os.path.join(os.path.dirname(__file__), "data")

    def sources(self, **kwargs):
        # Yield a single in-memory document, serialized to UTF-8 bytes.
        document = {
            "TITLE": "ACT I",
            "SCENE": {
                "TITLE": "SCENE I. A desert place.",
                "STAGEDIR": [
                    "Thunder and lightning. Enter three Witches",
                    "Exeunt",
                ],
                "SPEECH": {
                    "SPEAKER": "First Witch",
                },
            },
        }
        yield json.dumps(document).encode('utf-8')

    act = Field("act", JSON("TITLE"))
    character = Field("character", JSON("SCENE", "SPEECH", "SPEAKER"))
    scene = Field("scene", JSON("SCENE", "TITLE"))

    fields = [act, character, scene]


class JSONMultipleDocumentReader(JSONDocumentReader):
    """
    Example JSON reader for testing parsing arrays in JSON, using JSON data from https://github.com/tux255/analyzing-shakespeare
    """

    def sources(self, **kwargs):
        # glob already returns paths that include data_directory; the
        # original re-joined the directory onto each result, which only
        # worked by accident because os.path.join discards its first
        # argument when the second is absolute.
        yield from glob(os.path.join(self.data_directory, "*.json"))

    record_path = ["SCENE", "SPEECH"]
    meta = ["TITLE", ["SPEECH", "TITLE"], ["SPEECH", "STAGEDIR"]]

    act = Field("act", JSON("TITLE"))
    scene = Field("scene", JSON("SPEECH.TITLE"))
    character = Field("character", JSON("SPEAKER"))
    lines = Field("lines", JSON("LINE", transform=merge_lines))
    stage_dir = Field("stage_direction", JSON("SPEECH.STAGEDIR", transform=merge_lines))

    fields = [act, scene, character, lines, stage_dir]
43 changes: 43 additions & 0 deletions tests/test_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from tests.json.json_reader import JSONDocumentReader, JSONMultipleDocumentReader

# Expected documents for JSONMultipleDocumentReader over Macbeth.json:
# the first and last speeches are spelled out in full; the 8 speeches
# in between are stand-ins (only the first and last are asserted).
expected = [
    {
        'act': 'ACT I',
        'scene': 'SCENE I. A desert place.',
        'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt',
        'character': 'First Witch',
        'lines': 'When shall we three meet again\nIn thunder, lightning, or in rain?',
    },
    *[{}] * 8,
    {
        'act': 'ACT I',
        'scene': 'SCENE I. A desert place.',
        'stage_direction': 'Thunder and lightning. Enter three Witches\nExeunt',
        'character': 'ALL',
        'lines': "Fair is foul, and foul is fair:\nHover through the fog and filthy air.",
    },
]


def test_json_parse_single_document():
    '''test that the JSON reader extracts fields from a single in-memory document'''
    docs = list(JSONDocumentReader().documents())
    assert len(docs) == 1
    doc = docs[0]
    assert doc.get('act') == 'ACT I'
    assert doc.get('character') == 'First Witch'
    assert doc.get('scene') == 'SCENE I. A desert place.'


def test_json_parse_multiple_documents():
    '''test that JSON reader can parse multiple documents from an array in a single file'''
    docs = list(JSONMultipleDocumentReader().documents())
    assert len(docs) == len(expected)
    # Only the first and last speeches are fully specified in `expected`.
    _assert_matches(expected[0], docs[0])
    _assert_matches(expected[-1], docs[-1])


def _assert_matches(target: dict, doc: dict):
assert len(target.keys()) == len(doc.keys())
for key in target.keys():
assert doc.get(key) == target.get(key)
Loading