Skip to content

Commit

Permalink
make meta from DOCXToDocument JSON serializable
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Aug 21, 2024
1 parent aca8f09 commit 4f667f7
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 17 deletions.
14 changes: 8 additions & 6 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ class DOCXMetadata:
category: str
comments: str
content_status: str
created: Optional[datetime]
created: Optional[str]
identifier: str
keywords: str
language: str
last_modified_by: str
last_printed: Optional[datetime]
modified: Optional[datetime]
last_printed: Optional[str]
modified: Optional[str]
revision: int
subject: str
title: str
Expand Down Expand Up @@ -192,13 +192,15 @@ def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
category=document.core_properties.category,
comments=document.core_properties.comments,
content_status=document.core_properties.content_status,
created=document.core_properties.created,
created=document.core_properties.created.isoformat() if document.core_properties.created else None,
identifier=document.core_properties.identifier,
keywords=document.core_properties.keywords,
language=document.core_properties.language,
last_modified_by=document.core_properties.last_modified_by,
last_printed=document.core_properties.last_printed,
modified=document.core_properties.modified,
last_printed=document.core_properties.last_printed.isoformat()
if document.core_properties.last_printed
else None,
modified=document.core_properties.modified.isoformat() if document.core_properties.modified else None,
revision=document.core_properties.revision,
subject=document.core_properties.subject,
title=document.core_properties.title,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
fixes:
- |
The metadata produced by the `DOCXToDocument` component is now JSON serializable.
Previously, it contained `datetime` objects (automatically extracted from DOCX files), and `datetime` objects are not JSON serializable.
Now, these `datetime` objects are converted to ISO 8601 strings.
26 changes: 15 additions & 11 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
import datetime
import json

import pytest

Expand Down Expand Up @@ -34,21 +34,21 @@ def test_run(self, test_files_path, docx_converter):
category="",
comments="",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
version="",
),
}

def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
def test_run_with_additional_meta(self, test_files_path, docx_converter):
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
doc = output["documents"][0]
Expand All @@ -59,13 +59,13 @@ def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
category="",
comments="",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
Expand All @@ -82,7 +82,7 @@ def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter
assert "doc_1.txt and convert it" in caplog.text
assert results["documents"] == []

def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
def test_run_error_non_existent_file(self, docx_converter, caplog):
"""
Test if the component correctly handles errors.
"""
Expand Down Expand Up @@ -121,13 +121,13 @@ def test_document_with_docx_metadata_to_dict(self):
category="category",
comments="comments",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
Expand All @@ -149,17 +149,21 @@ def test_document_with_docx_metadata_to_dict(self):
"category": "category",
"comments": "comments",
"content_status": "",
"created": datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
},
}

# check it is JSON serializable
json_str = json.dumps(doc.to_dict(flatten=False))
assert json.loads(json_str) == doc.to_dict(flatten=False)

0 comments on commit 4f667f7

Please sign in to comment.