diff --git a/haystack/components/preprocessors/document_cleaner.py b/haystack/components/preprocessors/document_cleaner.py index 233dfed50a..282728ea74 100644 --- a/haystack/components/preprocessors/document_cleaner.py +++ b/haystack/components/preprocessors/document_cleaner.py @@ -6,7 +6,8 @@ from copy import deepcopy from functools import partial, reduce from itertools import chain -from typing import Generator, List, Optional, Set +from typing import Generator, List, Literal, Optional, Set +from unicodedata import normalize from haystack import Document, component, logging @@ -45,6 +46,8 @@ def __init__( keep_id: bool = False, remove_substrings: Optional[List[str]] = None, remove_regex: Optional[str] = None, + unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None, + ascii_only: bool = False, ): """ Initialize DocumentCleaner. @@ -57,14 +60,34 @@ def __init__( :param remove_substrings: List of substrings to remove from the text. :param remove_regex: Regex to match and replace substrings by "". :param keep_id: If `True`, keeps the IDs of the original documents. + :param unicode_normalization: Unicode normalization form to apply to the text. + Note: This will run before any other steps. + :param ascii_only: Whether to convert the text to ASCII only. + Will remove accents from characters and replace them with ASCII characters. + Other non-ASCII characters will be removed. + Note: This will run before any pattern matching or removal. 
""" + self._validate_params(unicode_normalization=unicode_normalization) + self.remove_empty_lines = remove_empty_lines self.remove_extra_whitespaces = remove_extra_whitespaces self.remove_repeated_substrings = remove_repeated_substrings self.remove_substrings = remove_substrings self.remove_regex = remove_regex self.keep_id = keep_id + self.unicode_normalization = unicode_normalization + self.ascii_only = ascii_only + + def _validate_params(self, unicode_normalization: Optional[str]): + """ + Validate the parameters of the DocumentCleaner. + + :param unicode_normalization: Unicode normalization form to apply to the text. + :raises ValueError: if the parameters are not valid. + """ + if unicode_normalization and unicode_normalization not in ["NFC", "NFKC", "NFD", "NFKD"]: + raise ValueError("unicode_normalization must be one of 'NFC', 'NFKC', 'NFD', 'NFKD'.") @component.output_types(documents=List[Document]) def run(self, documents: List[Document]): @@ -93,6 +116,10 @@ def run(self, documents: List[Document]): continue text = doc.content + if self.unicode_normalization: + text = self._normalize_unicode(text, self.unicode_normalization) + if self.ascii_only: + text = self._ascii_only(text) if self.remove_extra_whitespaces: text = self._remove_extra_whitespaces(text) if self.remove_empty_lines: @@ -108,6 +135,32 @@ def run(self, documents: List[Document]): return {"documents": cleaned_docs} + def _normalize_unicode(self, text: str, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> str: + """ + Normalize the unicode of the text. + + :param text: Text to normalize. + :param form: Unicode normalization form to apply to the text. + Options: "NFC", "NFKC", "NFD", "NFKD". + :returns: The normalized text. + """ + return normalize(form, text) + + def _ascii_only(self, text: str) -> str: + """ + Convert the text to ASCII only. + + Will remove accents from characters and replace them with ASCII characters. + Other non-ASCII characters will be removed. 
+ + :param text: Text to convert to ASCII only. + :returns: The text in ASCII only. + """ + + # First normalize the text to NFKD to separate the characters and their diacritics + # Then encode it to ASCII and ignore any characters that can't be encoded + return self._normalize_unicode(text, "NFKD").encode("ascii", "ignore").decode("utf-8") + def _remove_empty_lines(self, text: str) -> str: """ Remove empty lines and lines that contain nothing but whitespaces from text. diff --git a/releasenotes/notes/add-unicode-normalization-and-ascii-mode-to-document-cleaner-ba536b46e499663c.yaml b/releasenotes/notes/add-unicode-normalization-and-ascii-mode-to-document-cleaner-ba536b46e499663c.yaml new file mode 100644 index 0000000000..d4d28ee47b --- /dev/null +++ b/releasenotes/notes/add-unicode-normalization-and-ascii-mode-to-document-cleaner-ba536b46e499663c.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Added `unicode_normalization` parameter to the DocumentCleaner, allowing the text to be normalized to NFC, NFD, NFKC, or NFKD. + - | + Added `ascii_only` parameter to the DocumentCleaner, transforming letters with diacritics to their ASCII equivalent and removing other non-ASCII characters. 
diff --git a/test/components/preprocessors/test_document_cleaner.py b/test/components/preprocessors/test_document_cleaner.py index 0acd9e8e8c..9bd5df549e 100644 --- a/test/components/preprocessors/test_document_cleaner.py +++ b/test/components/preprocessors/test_document_cleaner.py @@ -139,3 +139,68 @@ def test_keep_id_does_not_alter_document_ids(self): assert len(result["documents"]) == 2 assert result["documents"][0].id == "1" assert result["documents"][1].id == "2" + + def test_unicode_normalization(self): + text = """\ + アイウエオ + Comment ça va + مرحبا بالعالم + em Space""" + + expected_text_NFC = """\ + アイウエオ + Comment ça va + مرحبا بالعالم + em Space""" + + expected_text_NFD = """\ + アイウエオ + Comment ça va + مرحبا بالعالم + em Space""" + + expected_text_NFKC = """\ + アイウエオ + Comment ça va + مرحبا بالعالم + em Space""" + + expected_text_NFKD = """\ + アイウエオ + Comment ça va + مرحبا بالعالم + em Space""" + + nfc_cleaner = DocumentCleaner(unicode_normalization="NFC", remove_extra_whitespaces=False) + nfd_cleaner = DocumentCleaner(unicode_normalization="NFD", remove_extra_whitespaces=False) + nfkc_cleaner = DocumentCleaner(unicode_normalization="NFKC", remove_extra_whitespaces=False) + nfkd_cleaner = DocumentCleaner(unicode_normalization="NFKD", remove_extra_whitespaces=False) + + nfc_result = nfc_cleaner.run(documents=[Document(content=text)]) + nfd_result = nfd_cleaner.run(documents=[Document(content=text)]) + nfkc_result = nfkc_cleaner.run(documents=[Document(content=text)]) + nfkd_result = nfkd_cleaner.run(documents=[Document(content=text)]) + + assert nfc_result["documents"][0].content == expected_text_NFC + assert nfd_result["documents"][0].content == expected_text_NFD + assert nfkc_result["documents"][0].content == expected_text_NFKC + assert nfkd_result["documents"][0].content == expected_text_NFKD + + def test_ascii_only(self): + text = """\ + アイウエオ + Comment ça va + Á + مرحبا بالعالم + em Space""" + + expected_text = """\ + \n\ + Comment ca va + A + \n\ + em 
Space""" + + cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False) + result = cleaner.run(documents=[Document(content=text)]) + assert result["documents"][0].content == expected_text