Skip to content

Commit

Permalink
to_passage_level_json separates naming of text block fields (#125)
Browse files Browse the repository at this point in the history
* to_passage_level_json separates naming of text block fields

* rename block_index -> text_block.index

* bump version
  • Loading branch information
kdutia authored Oct 7, 2024
1 parent 972cb02 commit a9ccb61
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 12 deletions.
29 changes: 23 additions & 6 deletions src/cpr_sdk/parser_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,20 @@ def from_flat_json(data: dict):

return ParserOutput.model_validate(unflattened)

@staticmethod
def _rename_text_block_keys(
    keys: Union[list[str], dict[str, Any]]
) -> Union[list[str], dict[str, Any]]:
    """
    Namespace text block field names under the `text_block.` prefix.

    :param keys: either a list of field names, or a dictionary keyed by
        field name
    :return: a new list with every name prefixed, or a new dictionary with
        every key prefixed and its values carried over unchanged
    :raises ValueError: if keys is neither a list nor a dictionary
    """

    def _prefixed(name: str) -> str:
        # Single place where the namespaced field name is built.
        return f"text_block.{name}"

    if isinstance(keys, dict):
        # Only the top-level keys are renamed; values are passed through as-is.
        return {_prefixed(name): item for name, item in keys.items()}

    if isinstance(keys, list):
        return list(map(_prefixed, keys))

    raise ValueError("keys must be a list or a dictionary")

def to_passage_level_json(self, include_empty: bool = True) -> list[dict[str, Any]]:
"""
Convert the parser output to a passage-level JSON format.
Expand Down Expand Up @@ -412,28 +426,31 @@ def to_passage_level_json(self, include_empty: bool = True) -> list[dict[str, An
)
)

empty_html_text_block_keys: list[str] = list(HTMLTextBlock.model_fields.keys())
empty_pdf_text_block_keys: list[str] = list(PDFTextBlock.model_fields.keys())
empty_html_text_block_keys: list[str] = self._rename_text_block_keys(list(HTMLTextBlock.model_fields.keys())) # type: ignore
empty_pdf_text_block_keys: list[str] = self._rename_text_block_keys(list(PDFTextBlock.model_fields.keys())) # type: ignore

if not self.text_blocks:
passages_array_filled = [
{key: None for key in empty_html_text_block_keys}
| {key: None for key in empty_pdf_text_block_keys}
| fixed_fields_dict
| {"block_index": 0, PDF_PAGE_METADATA_KEY: None}
| {"text_block.index": 0, PDF_PAGE_METADATA_KEY: None}
]

return passages_array_filled

passages_array = [
fixed_fields_dict
| json.loads(block.model_dump_json(exclude={"text"}))
| {"text": block.to_string(), "block_index": idx}
| self._rename_text_block_keys(
json.loads(block.model_dump_json(exclude={"text"}))
)
| {"text_block.text": block.to_string(), "text_block.index": idx}
for idx, block in enumerate(self.text_blocks)
]

# TODO: do we need this code?
for passage in passages_array:
page_number = passage.get("page_number", None)
page_number = passage.get("text_block.page_number", None)
passage[PDF_PAGE_METADATA_KEY] = (
self.get_page_metadata_by_page_number(page_number)
if page_number is not None
Expand Down
4 changes: 2 additions & 2 deletions src/cpr_sdk/version.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
_MAJOR = "1"
_MINOR = "7"
_PATCH = "1"
_MINOR = "8"
_PATCH = "0"
_SUFFIX = ""

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
Expand Down
8 changes: 4 additions & 4 deletions tests/test_parser_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,11 @@ def test_to_passage_level_json_method(
kwarg in the `to_passage_level_json` method
"""
expected_top_level_fields = set(
list(TextBlock.model_fields.keys())
+ list(HTMLTextBlock.model_fields.keys())
+ list(PDFTextBlock.model_fields.keys())
[f"text_block.{k}" for k in list(TextBlock.model_fields.keys())]
+ [f"text_block.{k}" for k in list(HTMLTextBlock.model_fields.keys())]
+ [f"text_block.{k}" for k in list(PDFTextBlock.model_fields.keys())]
+ list(ParserOutput.model_fields.keys())
+ ["block_index", PDF_PAGE_METADATA_KEY]
+ ["text_block.index", PDF_PAGE_METADATA_KEY]
)

expected_document_metadata_fields = set(BackendDocument.model_fields.keys())
Expand Down

0 comments on commit a9ccb61

Please sign in to comment.