Adding a method to the ParserOutput object to convert the object to a passage-level array.
climatepolicyradar · Apr 16, 2024 · 258039b · 258039b
1 parent 5e21d2b
commit 258039b
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 7 deletions.
diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py
@@ -3,7 +3,8 @@
 from collections import Counter
 from datetime import date
 from enum import Enum
-from typing import List, Optional, Sequence, Tuple, TypeVar, Union
+import json
+from typing import List, Optional, Sequence, Tuple, TypeVar, Union, Any
 
 from cpr_sdk.pipeline_general_models import (
     CONTENT_TYPE_HTML,
@@ -373,3 +374,54 @@ def from_flat_json(data: dict):
         unflattened = remove_key_if_all_nested_vals_none(unflattened, "pdf_data")
 
         return ParserOutput.model_validate(unflattened)
+
+    def to_passage_level_json(self) -> list[dict[str, Any]]:
+        """
+        Convert the parser output to a passage-level JSON format.
+
+        In passage-level format we have a row for every text block in the document,
+        because for natural language processing tasks we often want to work with text
+        at the passage level.
+
+        Each row merges the document-level fields (without the nested text blocks and
+        PDF page metadata), the text block's own fields, the block's text as returned
+        by `to_string()`, and `block_index`, the block's position in `text_blocks`.
+
+        HTML data won't contain PDF fields and vice versa, so any HTMLTextBlock or
+        PDFTextBlock field missing from a row is filled in with None. We could rely on
+        the Hugging Face dataset transformation to fill in the missing fields, but
+        this is more explicit and provides default values.
+
+        We convert each pydantic model to a string with model_dump_json and reload it
+        with json.loads because objects like Enums and child pydantic objects persist
+        when using the model_dump method. We don't want these when we push to
+        Hugging Face.
+
+        Returns one dict per text block, or an empty list when `text_blocks` is None.
+        """
+        if self.text_blocks is None:
+            return []
+
+        # Document-level fields shared by every passage; the per-block data is
+        # excluded here because it is merged in one block at a time below.
+        common_fields_dict = json.loads(
+            self.model_dump_json(
+                exclude={
+                    "pdf_data": {"text_blocks", "page_metadata"},
+                    "html_data": {"text_blocks"},
+                }
+            )
+        )
+
+        # Later operands of `|` win on key clashes, so the block's fields and then
+        # the explicit "text" / "block_index" values take precedence.
+        passages_array = [
+            common_fields_dict
+            | json.loads(block.model_dump_json(exclude={"text"}))
+            | {"text": block.to_string(), "block_index": idx}
+            for idx, block in enumerate(self.text_blocks)
+        ]
+
+        empty_html_text_block_keys: list[str] = list(HTMLTextBlock.model_fields.keys())
+        empty_pdf_text_block_keys: list[str] = list(PDFTextBlock.model_fields.keys())
+
+        # The dicts are updated in place, so `passages_array_filled` ends up holding
+        # the same dict objects as `passages_array`.
+        passages_array_filled = []
+        for passage in passages_array:
+            for key in empty_html_text_block_keys:
+                if key not in passage:
+                    passage[key] = None
+            for key in empty_pdf_text_block_keys:
+                if key not in passage:
+                    passage[key] = None
+            passages_array_filled.append(passage)
+
+        return passages_array_filled
diff --git a/src/cpr_sdk/version.py b/src/cpr_sdk/version.py
@@ -1,6 +1,6 @@
 _MAJOR = "1"
-_MINOR = "0"
-_PATCH = "2"
+_MINOR = "1"
+_PATCH = "0"
 _SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)

diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py
@@ -1,15 +1,15 @@
 import pydantic
 import pytest
+
 from cpr_sdk.parser_models import (
     ParserInput,
     ParserOutput,
     PDFTextBlock,
     VerticalFlipError,
+    HTMLTextBlock,
+    TextBlock,
 )
-from cpr_sdk.pipeline_general_models import (
-    CONTENT_TYPE_HTML,
-    CONTENT_TYPE_PDF,
-)
+from cpr_sdk.pipeline_general_models import CONTENT_TYPE_HTML, CONTENT_TYPE_PDF
 
 
 def test_parser_input_object(parser_output_json_pdf) -> None:
@@ -150,3 +150,47 @@ def test_parser_output_object(
     with pytest.raises(pydantic.ValidationError) as context:
         ParserOutput.model_validate(parser_output_json_flat)
     parser_output = ParserOutput.from_flat_json(parser_output_json_flat)
+
+
+def test_to_passage_level_json_method(
+    parser_output_json_pdf: dict,
+    parser_output_json_html: dict,
+) -> None:
+    """
+    Test that we can successfully create a passage level array from the text blocks.
+
+    For both a PDF and an HTML parser output this checks that there is one passage
+    per text block, that every passage is a dict, and that all passages share the
+    same keys: the union of the TextBlock, HTMLTextBlock, PDFTextBlock and
+    ParserOutput model fields plus "block_index".
+    """
+    parser_output_pdf = ParserOutput.model_validate(parser_output_json_pdf)
+    passage_level_array_pdf = parser_output_pdf.to_passage_level_json()
+
+    parser_output_html = ParserOutput.model_validate(parser_output_json_html)
+    passage_level_array_html = parser_output_html.to_passage_level_json()
+
+    # One passage is expected per text block.
+    assert len(passage_level_array_pdf) == len(parser_output_pdf.text_blocks)
+    assert len(passage_level_array_html) == len(parser_output_html.text_blocks)
+
+    for passage_level_array in [passage_level_array_pdf, passage_level_array_html]:
+        assert all(isinstance(passage, dict) for passage in passage_level_array)
+
+        # Every passage must expose the same keys as the first one.
+        # NOTE(review): indexing [0] assumes each input yields at least one text
+        # block - confirm against the fixtures.
+        first_doc_keys = set(passage_level_array[0].keys())
+        assert all(
+            set(passage.keys()) == first_doc_keys for passage in passage_level_array
+        )
+
+        # Keys expected on every passage regardless of the document content type.
+        expected_model_fields = set(
+            list(TextBlock.model_fields.keys())
+            + list(HTMLTextBlock.model_fields.keys())
+            + list(PDFTextBlock.model_fields.keys())
+            + list(ParserOutput.model_fields.keys())
+            + ["block_index"]
+        )
+
+        assert all(
+            set(passage.keys()) == expected_model_fields
+            for passage in passage_level_array
+        )
+
+    passage_level_array_pdf_first_doc = passage_level_array_pdf[0]
+    passage_level_array_html_first_doc = passage_level_array_html[0]
+
+    # PDF and HTML passages must share a single schema.
+    assert (
+        passage_level_array_pdf_first_doc.keys()
+        == passage_level_array_html_first_doc.keys()
+    )