From a9ccb61e71aec371018d69211b5519f117f3b178 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 7 Oct 2024 10:25:47 +0100 Subject: [PATCH] to_passage_level_json separates naming of text block fields (#125) * to_passage_level_json separates naming of text block fields * rename block_index -> text_block.index * bump version --- src/cpr_sdk/parser_models.py | 29 +++++++++++++++++++++++------ src/cpr_sdk/version.py | 4 ++-- tests/test_parser_models.py | 8 ++++---- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/cpr_sdk/parser_models.py b/src/cpr_sdk/parser_models.py index d5fcfb9..82fc6dc 100644 --- a/src/cpr_sdk/parser_models.py +++ b/src/cpr_sdk/parser_models.py @@ -378,6 +378,20 @@ def from_flat_json(data: dict): return ParserOutput.model_validate(unflattened) + @staticmethod + def _rename_text_block_keys( + keys: Union[list[str], dict[str, Any]] + ) -> Union[list[str], dict[str, Any]]: + """Prepend text_block. to the keys in the dictionary or list.""" + + if isinstance(keys, list): + return [f"text_block.{key}" for key in keys] + + if isinstance(keys, dict): + return {f"text_block.{key}": value for key, value in keys.items()} + + raise ValueError("keys must be a list or a dictionary") + def to_passage_level_json(self, include_empty: bool = True) -> list[dict[str, Any]]: """ Convert the parser output to a passage-level JSON format. @@ -412,28 +426,31 @@ def to_passage_level_json(self, include_empty: bool = True) -> list[dict[str, An ) ) - empty_html_text_block_keys: list[str] = list(HTMLTextBlock.model_fields.keys()) - empty_pdf_text_block_keys: list[str] = list(PDFTextBlock.model_fields.keys()) + empty_html_text_block_keys: list[str] = self._rename_text_block_keys(list(HTMLTextBlock.model_fields.keys())) # type: ignore + empty_pdf_text_block_keys: list[str] = self._rename_text_block_keys(list(PDFTextBlock.model_fields.keys())) # type: ignore if not self.text_blocks: passages_array_filled = [ {key: None for key in empty_html_text_block_keys} | {key: None for key in empty_pdf_text_block_keys} | fixed_fields_dict - | {"block_index": 0, PDF_PAGE_METADATA_KEY: None} + | {"text_block.index": 0, PDF_PAGE_METADATA_KEY: None} ] return passages_array_filled passages_array = [ fixed_fields_dict - | json.loads(block.model_dump_json(exclude={"text"})) - | {"text": block.to_string(), "block_index": idx} + | self._rename_text_block_keys( + json.loads(block.model_dump_json(exclude={"text"})) + ) + | {"text_block.text": block.to_string(), "text_block.index": idx} for idx, block in enumerate(self.text_blocks) ] + # TODO: do we need this code? for passage in passages_array: - page_number = passage.get("page_number", None) + page_number = passage.get("text_block.page_number", None) passage[PDF_PAGE_METADATA_KEY] = ( self.get_page_metadata_by_page_number(page_number) if page_number is not None diff --git a/src/cpr_sdk/version.py b/src/cpr_sdk/version.py index bb9ea3b..b593b2d 100644 --- a/src/cpr_sdk/version.py +++ b/src/cpr_sdk/version.py @@ -1,6 +1,6 @@ _MAJOR = "1" -_MINOR = "7" -_PATCH = "1" +_MINOR = "8" +_PATCH = "0" _SUFFIX = "" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) diff --git a/tests/test_parser_models.py b/tests/test_parser_models.py index 1da4384..dad563a 100644 --- a/tests/test_parser_models.py +++ b/tests/test_parser_models.py @@ -176,11 +176,11 @@ def test_to_passage_level_json_method( kwarg in the `to_passage_level_json` method """ expected_top_level_fields = set( - list(TextBlock.model_fields.keys()) - + list(HTMLTextBlock.model_fields.keys()) - + list(PDFTextBlock.model_fields.keys()) + [f"text_block.{k}" for k in list(TextBlock.model_fields.keys())] + + [f"text_block.{k}" for k in list(HTMLTextBlock.model_fields.keys())] + + [f"text_block.{k}" for k in list(PDFTextBlock.model_fields.keys())] + list(ParserOutput.model_fields.keys()) - + ["block_index", PDF_PAGE_METADATA_KEY] + + ["text_block.index", PDF_PAGE_METADATA_KEY] ) expected_document_metadata_fields = set(BackendDocument.model_fields.keys())