diff --git a/haystack/components/preprocessors/document_preprocessor.py b/haystack/components/preprocessors/document_preprocessor.py index 7eb91f37fa..5b6fbf2434 100644 --- a/haystack/components/preprocessors/document_preprocessor.py +++ b/haystack/components/preprocessors/document_preprocessor.py @@ -186,9 +186,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "DocumentPreprocessor": :returns: Deserialized SuperComponent. """ - if "splitting_function" in data["init_parameters"]: - data["init_parameters"]["splitting_function"] = deserialize_callable( - data["init_parameters"]["splitting_function"] - ) - + splitting_function = data["init_parameters"].get("splitting_function", None) + if splitting_function: + data["init_parameters"]["splitting_function"] = deserialize_callable(splitting_function) return default_from_dict(cls, data) diff --git a/test/components/preprocessors/test_document_preprocessor.py b/test/components/preprocessors/test_document_preprocessor.py index 21d6ee5c8a..ee04c7a3e8 100644 --- a/test/components/preprocessors/test_document_preprocessor.py +++ b/test/components/preprocessors/test_document_preprocessor.py @@ -45,22 +45,29 @@ def test_init(self, preprocessor: DocumentPreprocessor) -> None: assert splitter.language == "en" def test_from_dict(self) -> None: - preprocessor = DocumentPreprocessor.from_dict( - { - "init_parameters": { - "remove_empty_lines": True, - "remove_extra_whitespaces": True, - "remove_repeated_substrings": False, - "keep_id": True, - "split_by": "word", - "split_length": 3, - "split_overlap": 1, - "respect_sentence_boundary": False, - "language": "en", - }, - "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor", - } - ) + data = { + "init_parameters": { + "remove_empty_lines": True, + "remove_extra_whitespaces": True, + "remove_repeated_substrings": False, + "keep_id": True, + "remove_substrings": None, + "remove_regex": None, + "unicode_normalization": None, + "ascii_only": False, + "split_by": "word", + "split_length": 3, + "split_overlap": 1, + "split_threshold": 0, + "splitting_function": None, + "respect_sentence_boundary": False, + "language": "en", + "use_split_rules": True, + "extend_abbreviations": True, + }, + "type": "haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor", + } + preprocessor = DocumentPreprocessor.from_dict(data) assert isinstance(preprocessor, DocumentPreprocessor) def test_to_dict(self, preprocessor: DocumentPreprocessor) -> None: