diff --git a/document_merge_service/api/apps.py b/document_merge_service/api/apps.py
index 68a27710..78326e0e 100644
--- a/document_merge_service/api/apps.py
+++ b/document_merge_service/api/apps.py
@@ -10,3 +10,83 @@ class DefaultConfig(AppConfig):
     def ready(self):
         if "sqlite3" in settings.DATABASES["default"]["ENGINE"]:  # pragma: no cover
             TextField.register_lookup(IContains, lookup_name="search")
+        mitigate_docxtpl_corruption_bug()
+
+
+def mitigate_docxtpl_corruption_bug():
+    """Monkey-patch python-docx to mitigate a docx corruption bug.
+
+    This is basically monkey-patching this PR:
+    https://github.com/python-openxml/python-docx/pull/1436
+
+    An additional relationship type for the core properties part is
+    registered, and ``OpcPackage._core_properties_part`` is replaced by a
+    version that also looks the part up through that type (re-labelling the
+    relationship as ``RT.CORE_PROPERTIES``) before it creates a default part.
+
+    The function is idempotent: Django may call ``AppConfig.ready()`` more
+    than once in corner cases (e.g. tests that fiddle with installed apps), so
+    a marker attribute tells an earlier run of this function apart from an
+    upstream release that already contains the fix.
+
+    Raises:
+        Exception: If python-docx itself already defines
+            ``CORE_PROPERTIES_OFFICEDOCUMENT``, i.e. the mitigation is obsolete
+            and this code should be removed.
+    """
+    from docx.opc.constants import RELATIONSHIP_TYPE
+
+    patch_marker = "_dms_core_properties_patch_applied"
+    if getattr(RELATIONSHIP_TYPE, patch_marker, False):  # pragma: no cover
+        return
+
+    if hasattr(RELATIONSHIP_TYPE, "CORE_PROPERTIES_OFFICEDOCUMENT"):  # pragma: no cover
+        raise Exception(
+            "The docxtpl mitigation is no longer required, please remove the monkeypatch code"
+        )
+
+    RELATIONSHIP_TYPE.CORE_PROPERTIES_OFFICEDOCUMENT = (
+        "http://schemas.openxmlformats.org/officedocument/2006/relationships"
+        "/metadata/core-properties"
+    )
+
+    from docx.opc.package import RT, CorePropertiesPart, OpcPackage, cast
+
+    @property
+    def _core_properties_part(self) -> CorePropertiesPart:
+        """|CorePropertiesPart| object related to this package.
+
+        Creates a default core properties part if one is not present (not common).
+        """
+        try:
+            return cast(CorePropertiesPart, self.part_related_by(RT.CORE_PROPERTIES))
+        except KeyError:
+            try:
+                office_document_part = self.part_related_by(
+                    RT.CORE_PROPERTIES_OFFICEDOCUMENT  # type: ignore
+                )
+                rel = self.relate_to(
+                    office_document_part,
+                    RT.CORE_PROPERTIES_OFFICEDOCUMENT,  # type: ignore
+                )
+                self.rels[rel].reltype = RT.CORE_PROPERTIES
+                return cast(CorePropertiesPart, office_document_part)
+            except KeyError:
+                core_properties_part = CorePropertiesPart.default(self)
+                self.relate_to(core_properties_part, RT.CORE_PROPERTIES)
+                return core_properties_part
+
+    OpcPackage._core_properties_part = _core_properties_part
+
+    from docx.opc.rel import _Relationship
+
+    # The replacement property above assigns to ``reltype``, so the attribute
+    # needs a setter.
+    @_Relationship.reltype.setter
+    def reltype(self, value: str):
+        self._reltype = value
+
+    _Relationship.reltype = reltype
+
+    # Remember that the patch is in place so repeated calls return early.
+    setattr(RELATIONSHIP_TYPE, patch_marker, True)
diff --git a/document_merge_service/api/data/created_with_libreoffice.docx b/document_merge_service/api/data/created_with_libreoffice.docx
new file mode 100644
index 00000000..39f4956e
Binary files /dev/null and b/document_merge_service/api/data/created_with_libreoffice.docx differ
diff --git a/document_merge_service/api/tests/test_template.py b/document_merge_service/api/tests/test_template.py
index d0964383..cb397a6d 100644
--- a/document_merge_service/api/tests/test_template.py
+++ b/document_merge_service/api/tests/test_template.py
@@ -2,7 +2,9 @@
 import json
 import os
 import re
-from collections import namedtuple
+import tempfile
+import zipfile
+from collections import Counter, namedtuple
 
 import openpyxl
 import pytest
@@ -916,3 +918,38 @@ def test_placeholder_with_unsupported_operand(
     with pytest.raises(exceptions.ValidationError) as exc_info:
         serializer.validate({"data": {"E_BAU_NUMBER": 12345}})
     assert exc_info.value.args[0] == expected_error
+
+
+def test_template_merge_docx_libreoffice_bug(
+    db, client, mock_filefield_name_validation, template, snapshot
+):
+    """Verify a certain docx corruption bug does not occur.
+
+    Certain versions of python-docx and python-docxtemplate cause corruption
+    of files that were originally created with LibreOffice. One effect of that
+    corruption is a duplicate entry in the document-internal files; there
+    are two docProps/core.xml files in the resulting document.
+    """
+    file = django_file("created_with_libreoffice.docx")
+    template.template.save(os.path.basename(file.name), file)
+    template.engine = "docx-template"
+    template.save()
+    url = reverse("template-merge", args=[template.pk])
+
+    response = client.post(url, data={"data": {"test": "Test input"}}, format="json")
+
+    with tempfile.NamedTemporaryFile(suffix=".docx") as tmp:
+        tmp.write(response.content)
+        tmp.seek(0)
+
+        # Hand the open file object to ZipFile rather than re-opening it by
+        # name (not possible on every platform while the temp file is open),
+        # and let the context manager close the archive handle again.
+        with zipfile.ZipFile(tmp) as archive:
+            name_counter = Counter(entry.filename for entry in archive.infolist())
+
+    problematic_names = {
+        name: count for name, count in name_counter.most_common() if count > 1
+    }
+
+    assert problematic_names == {}, "Duplicate entry in docx file's internal structure"