feat: add unicode normalization & ascii_only mode for DocumentCleaner (#8103)

* feat: add unicode normalization & ascii_only mode for DocumentCleaner.

* feat: add unicode_normalization parameter validation to DocumentCleaner.

* test: fix the unit test to work after code linting.
twellck committed Aug 5, 2024
1 parent e17d0c4 commit 2e2f5f1
Showing 3 changed files with 125 additions and 1 deletion.
55 changes: 54 additions & 1 deletion haystack/components/preprocessors/document_cleaner.py
@@ -6,7 +6,8 @@
from copy import deepcopy
from functools import partial, reduce
from itertools import chain
from typing import Generator, List, Optional, Set
from typing import Generator, List, Literal, Optional, Set
from unicodedata import normalize

from haystack import Document, component, logging

@@ -45,6 +46,8 @@ def __init__(
keep_id: bool = False,
remove_substrings: Optional[List[str]] = None,
remove_regex: Optional[str] = None,
unicode_normalization: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
ascii_only: bool = False,
):
"""
Initialize DocumentCleaner.
@@ -57,14 +60,34 @@ def __init__(
:param remove_substrings: List of substrings to remove from the text.
:param remove_regex: Regex to match and replace substrings by "".
:param keep_id: If `True`, keeps the IDs of the original documents.
:param unicode_normalization: Unicode normalization form to apply to the text.
Note: This will run before any other steps.
:param ascii_only: Whether to convert the text to ASCII only.
Will remove accents from characters and replace them with ASCII characters.
Other non-ASCII characters will be removed.
Note: This will run before any pattern matching or removal.
"""

self._validate_params(unicode_normalization=unicode_normalization)

self.remove_empty_lines = remove_empty_lines
self.remove_extra_whitespaces = remove_extra_whitespaces
self.remove_repeated_substrings = remove_repeated_substrings
self.remove_substrings = remove_substrings
self.remove_regex = remove_regex
self.keep_id = keep_id
self.unicode_normalization = unicode_normalization
self.ascii_only = ascii_only

def _validate_params(self, unicode_normalization: Optional[str]):
"""
Validate the parameters of the DocumentCleaner.
:param unicode_normalization: Unicode normalization form to apply to the text.
:raises ValueError: if the parameters are not valid.
"""
if unicode_normalization and unicode_normalization not in ["NFC", "NFKC", "NFD", "NFKD"]:
raise ValueError("unicode_normalization must be one of 'NFC', 'NFKC', 'NFD', 'NFKD'.")

@component.output_types(documents=List[Document])
def run(self, documents: List[Document]):
@@ -93,6 +116,10 @@ def run(self, documents: List[Document]):
continue
text = doc.content

if self.unicode_normalization:
text = self._normalize_unicode(text, self.unicode_normalization)
if self.ascii_only:
text = self._ascii_only(text)
if self.remove_extra_whitespaces:
text = self._remove_extra_whitespaces(text)
if self.remove_empty_lines:
@@ -108,6 +135,32 @@ def run(self, documents: List[Document]):

return {"documents": cleaned_docs}

def _normalize_unicode(self, text: str, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> str:
"""
Normalize the unicode of the text.
:param text: Text to normalize.
:param form: Unicode normalization form to apply to the text.
Options: "NFC", "NFKC", "NFD", "NFKD".
:returns: The normalized text.
"""
return normalize(form, text)

def _ascii_only(self, text: str) -> str:
"""
Convert the text to ASCII only.
Will remove accents from characters and replace them with ASCII characters.
Other non-ASCII characters will be removed.
:param text: Text to convert to ASCII only.
:returns: The text in ASCII only.
"""

# First normalize the text to NFKD to separate the characters and their diacritics
# Then encode it to ASCII and ignore any characters that can't be encoded
return self._normalize_unicode(text, "NFKD").encode("ascii", "ignore").decode("utf-8")

def _remove_empty_lines(self, text: str) -> str:
"""
Remove empty lines and lines that contain nothing but whitespaces from text.
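
For context, a minimal usage sketch of the two new options, using the usual Haystack 2.x import path. The sample document content and the printed results are illustrative assumptions, not taken from this PR:

from haystack import Document
from haystack.components.preprocessors import DocumentCleaner

# Normalization runs before every other cleaning step, and ascii_only folds
# the text down to ASCII before any pattern matching or removal.
cleaner = DocumentCleaner(unicode_normalization="NFKC", ascii_only=True)

result = cleaner.run(documents=[Document(content="Comment ça va アイウエオ")])
print(result["documents"][0].content)  # roughly "Comment ca va": accents folded, katakana dropped

# An unsupported form fails fast in __init__ via _validate_params.
try:
    DocumentCleaner(unicode_normalization="NFKX")  # type: ignore[arg-type]
except ValueError as err:
    print(err)  # unicode_normalization must be one of 'NFC', 'NFKC', 'NFD', 'NFKD'.
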
@@ -0,0 +1,6 @@
---
enhancements:
- |
    Added `unicode_normalization` parameter to the DocumentCleaner, allowing the text to be normalized to NFC, NFD, NFKC, or NFKD.
- |
    Added `ascii_only` parameter to the DocumentCleaner, which transforms letters with diacritics to their ASCII equivalents and removes other non-ASCII characters.
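
For readers unfamiliar with the four forms named above, the new parameter hands the text to Python's unicodedata.normalize; a quick stdlib sketch (the sample characters are my own choice, not from the PR) shows how the forms differ:

from unicodedata import normalize

s = "e\u0301 \u2460 \ufb01"   # "e" + combining acute accent, circled digit one, "fi" ligature

print(normalize("NFC", s))    # canonical compose: the accent merges into "é"; ① and ﬁ stay as they are
print(normalize("NFD", s))    # canonical decompose: leaves "e" + U+0301 apart; ① and ﬁ unchanged
print(normalize("NFKC", s))   # compatibility compose: "é 1 fi"
print(normalize("NFKD", s))   # compatibility decompose: "e" + U+0301, "1", "fi"
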
65 changes: 65 additions & 0 deletions test/components/preprocessors/test_document_cleaner.py
@@ -139,3 +139,68 @@ def test_keep_id_does_not_alter_document_ids(self):
assert len(result["documents"]) == 2
assert result["documents"][0].id == "1"
assert result["documents"][1].id == "2"

def test_unicode_normalization(self):
text = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFC = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFD = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFKC = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

expected_text_NFKD = """\
アイウエオ
Comment ça va
مرحبا بالعالم
em Space"""

nfc_cleaner = DocumentCleaner(unicode_normalization="NFC", remove_extra_whitespaces=False)
nfd_cleaner = DocumentCleaner(unicode_normalization="NFD", remove_extra_whitespaces=False)
nfkc_cleaner = DocumentCleaner(unicode_normalization="NFKC", remove_extra_whitespaces=False)
nfkd_cleaner = DocumentCleaner(unicode_normalization="NFKD", remove_extra_whitespaces=False)

nfc_result = nfc_cleaner.run(documents=[Document(content=text)])
nfd_result = nfd_cleaner.run(documents=[Document(content=text)])
nfkc_result = nfkc_cleaner.run(documents=[Document(content=text)])
nfkd_result = nfkd_cleaner.run(documents=[Document(content=text)])

assert nfc_result["documents"][0].content == expected_text_NFC
assert nfd_result["documents"][0].content == expected_text_NFD
assert nfkc_result["documents"][0].content == expected_text_NFKC
assert nfkd_result["documents"][0].content == expected_text_NFKD

def test_ascii_only(self):
text = """\
アイウエオ
Comment ça va
Á
مرحبا بالعالم
em Space"""

expected_text = """\
\n\
Comment ca va
A
\n\
em Space"""

cleaner = DocumentCleaner(ascii_only=True, remove_extra_whitespaces=False, remove_empty_lines=False)
result = cleaner.run(documents=[Document(content=text)])
assert result["documents"][0].content == expected_text
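
The expected values above follow from the NFKD-then-encode fold that _ascii_only applies; a standalone sketch of the same trick (the helper name ascii_fold is mine, not part of the component):

from unicodedata import normalize

def ascii_fold(text: str) -> str:
    # Split each character into its base letter plus combining marks (NFKD),
    # then drop every code point that cannot be encoded as ASCII.
    return normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

print(ascii_fold("Comment ça va"))  # -> "Comment ca va": the cedilla becomes a combining mark and is dropped
print(ascii_fold("Á"))              # -> "A"
print(ascii_fold("アイウエオ"))      # -> "": katakana has no ASCII decomposition, so the line empties out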
