feat: add unicode normalization & ascii_only mode for DocumentCleaner (#8103)

* feat: add unicode normalization & ascii_only mode for DocumentCleaner.

* feat: add unicode_normalization parameter validation to DocumentCleaner.

* test: fix the unit test to work after code linting.
twellck committed Aug 5, 2024
1 parent e17d0c4 commit 2e2f5f1
Showing 3 changed files with 125 additions and 1 deletion.
55 changes: 54 additions & 1 deletion haystack/components/preprocessors/document_cleaner.py
@@ -6,7 +6,8 @@
from copy import deepcopy
from functools import partial, reduce
from itertools import chain
from typing import Generator, List, Optional, Set
from typing import Generator, List, Literal, Optional, Set
from unicodedata import normalize

from haystack import Document, component, logging

@@ -45,6 +46,8 @@ def __init__(
keep_id: bool = False,
remove_substrings: Optional[List[str]] = None,
remove_regex: Optional[str] = None,
unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
ascii_only: bool = False,
):
"""
Initialize DocumentCleaner.
@@ -57,14 +60,34 @@ def __init__(
:param remove_substrings: List of substrings to remove from the text.
:param remove_regex: Regex to match and replace substrings by "".
:param keep_id: If `True`, keeps the IDs of the original documents.
:param unicode_normalization: Unicode normalization form to apply to the text.
Note: This will run before any other steps.
:param ascii_only: Whether to convert the text to ASCII only.
Will remove accents from characters and replace them with ASCII characters.
Other non-ASCII characters will be removed.
Note: This will run before any pattern matching or removal.
"""

self._validate_params(unicode_normalization=unicode_normalization)

self.remove_empty_lines = remove_empty_lines
self.remove_extra_whitespaces = remove_extra_whitespaces
self.remove_repeated_substrings = remove_repeated_substrings
self.remove_substrings = remove_substrings
self.remove_regex = remove_regex
self.keep_id = keep_id
self.unicode_normalization = unicode_normalization
self.ascii_only = ascii_only

def _validate_params(self, unicode_normalization: Optional[str]):
"""
Validate the parameters of the DocumentCleaner.
:param unicode_normalization: Unicode normalization form to apply to the text.
:raises ValueError: if the parameters are not valid.
"""
if unicode_normalization and unicode_normalization not in ["NFC", "NFKC", "NFD", "NFKD"]:
raise ValueError("unicode_normalization must be one of 'NFC', 'NFKC', 'NFD', 'NFKD'.")

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
@@ -93,6 +116,10 @@ def run(self, documents: List[Document]):
continue
text = doc.content

if self.unicode_normalization:
text = self._normalize_unicode(text, self.unicode_normalization)
if self.ascii_only:
text = self._ascii_only(text)
if self.remove_extra_whitespaces:
text = self._remove_extra_whitespaces(text)
if self.remove_empty_lines:
@@ -108,6 +135,32 @@ def run(self, documents: List[Document]):

return {"documents": cleaned_docs}

def _normalize_unicode(self, text: str, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> str:
"""
Normalize the unicode of the text.
:param text: Text to normalize.
:param form: Unicode normalization form to apply to the text.
Options: "NFC", "NFKC", "NFD", "NFKD".
:returns: The normalized text.
"""
return normalize(form, text)

def _ascii_only(self, text: str) -> str:
"""
Convert the text to ASCII only.
Will remove accents from characters and replace them with ASCII characters.
Other non-ASCII characters will be removed.
:param text: Text to convert to ASCII only.
:returns: The text in ASCII only.
"""

# First normalize the text to NFKD to separate the characters and their diacritics
# Then encode it to ASCII and ignore any characters that can't be encoded
return self._normalize_unicode(text, "NFKD").encode("ascii", "ignore").decode("utf-8")

def _remove_empty_lines(self, text: str) -> str:
"""
Remove empty lines and lines that contain nothing but whitespaces from text.
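
For context, a minimal usage sketch of the two new options, using the usual Haystack 2.x import path. The sample document content and the printed results are illustrative assumptions, not taken from this PR:

from haystack import Document
from haystack.components.preprocessors import DocumentCleaner

# Normalization runs before every other cleaning step, and ascii_only folds
# the text down to ASCII before any pattern matching or removal.
cleaner = DocumentCleaner(unicode_normalization="NFKC", ascii_only=True)

result = cleaner.run(documents=[Document(content="Comment ça va アイウエオ")])
print(result["documents"][0].content)  # roughly "Comment ca va": accents folded, katakana dropped

# An unsupported form fails fast in __init__ via _validate_params.
try:
    DocumentCleaner(unicode_normalization="NFKX")  # type: ignore[arg-type]
except ValueError as err:
    print(err)  # unicode_normalization must be one of 'NFC', 'NFKC', 'NFD', 'NFKD'.
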
@@ -0,0 +1,6 @@
---
enhancements:
- |
    Added `unicode_normalization` parameter to the DocumentCleaner, allowing the text to be normalized to NFC, NFD, NFKC, or NFKD.
- |
    Added `ascii_only` parameter to the DocumentCleaner, which transforms letters with diacritics to their ASCII equivalents and removes other non-ASCII characters.
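
For readers unfamiliar with the four forms named above, the new parameter hands the text to Python's unicodedata.normalize; a quick stdlib sketch (the sample characters are my own choice, not from the PR) shows how the forms differ:

from unicodedata import normalize

s = "e\u0301 \u2460 \ufb01"   # "e" + combining acute accent, circled digit one, "fi" ligature

print(normalize("NFC", s))    # canonical compose: the accent merges into "é"; ① and ﬁ stay as they are
print(normalize("NFD", s))    # canonical decompose: leaves "e" + U+0301 apart; ① and ﬁ unchanged
print(normalize("NFKC", s))   # compatibility compose: "é 1 fi"
print(normalize("NFKD", s))   # compatibility decompose: "e" + U+0301, "1", "fi"
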
65 changes: 65 additions & 0 deletions test/components/preprocessors/test_document_cleaner.py
@@ -139,3 +139,68 @@ def test_keep_id_does_not_alter_document_ids(self):
assert len(result["documents"]) == 2
assert result["documents"][0].id == "1"
assert result["documents"][1].id == "2"

def test_unicode_normalization(self):
text = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFC = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFD = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFKC = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFKD = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

nfc_cleaner = DocumentCleaner(unicode_normalization="NFC", remove_extra_whitespaces=False)
nfd_cleaner = DocumentCleaner(unicode_normalization="NFD", remove_extra_whitespaces=False)
nfkc_cleaner = DocumentCleaner(unicode_normalization="NFKC", remove_extra_whitespaces=False)
nfkd_cleaner = DocumentCleaner(unicode_normalization="NFKD", remove_extra_whitespaces=False)

nfc_result = nfc_cleaner.run(documents=[Document(content=text)])
nfd_result = nfd_cleaner.run(documents=[Document(content=text)])
nfkc_result = nfkc_cleaner.run(documents=[Document(content=text)])
nfkd_result = nfkd_cleaner.run(documents=[Document(content=text)])

assert nfc_result["documents"][0].content == expected_text_NFC
assert nfd_result["documents"][0].content == expected_text_NFD
assert nfkc_result["documents"][0].content == expected_text_NFKC
assert nfkd_result["documents"][0].content == expected_text_NFKD

def test_ascii_only(self):
text = """\
アイウエオ
Comment ça va
Á
مرحبا بالعالم
em Space"""

expected_text = """\
\n\
Comment ca va
A
\n\
em Space"""

cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False)
result = cleaner.run(documents=[Document(content=text)])
assert result["documents"][0].content == expected_text
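
The expected values above follow from the NFKD-then-encode fold that _ascii_only applies; a standalone sketch of the same trick (the helper name ascii_fold is mine, not part of the component):

from unicodedata import normalize

def ascii_fold(text: str) -> str:
    # Split each character into its base letter plus combining marks (NFKD),
    # then drop every code point that cannot be encoded as ASCII.
    return normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

print(ascii_fold("Comment ça va"))  # -> "Comment ca va": the cedilla becomes a combining mark and is dropped
print(ascii_fold("Á"))              # -> "A"
print(ascii_fold("アイウエオ"))      # -> "": katakana has no ASCII decomposition, so the line empties out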
