From 158de8dd24d01cf83c6d48e59e5fad8d8ddcc9bd Mon Sep 17 00:00:00 2001
From: Wahab Alshahin <wahab.alshahin@gmail.com>
Date: Wed, 6 Sep 2023 23:29:59 -0400
Subject: [PATCH] Add clean_ligatures to core cleaners

---
 CHANGELOG.md                            |  1 +
 test_unstructured/cleaners/test_core.py | 26 +++++++++++++++++++++
 unstructured/cleaners/core.py           | 31 +++++++++++++++++++++++++
 3 files changed, 58 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a349c18a0b..c5ea67ac7e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 ### Features
 
 * Add Jira Connector to be able to pull issues from a Jira organization
+* Add `clean_ligatures` function to expand ligatures in text
 
 ### Fixes
 
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index 3f2b7a7775..eec8edd2b9 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -62,6 +62,32 @@ def test_clean_ordered_bullets(text, expected):
     assert core.clean_ordered_bullets(text=text) == expected
 
 
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [  # one case per ligature that core.clean_ligatures expands
+        ("The æther is a classic element.", "The aether is a classic element."),
+        ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
+        ("The buﬀer zone is there.", "The buffer zone is there."),
+        ("The ﬁle was found in the system.", "The file was found in the system."),
+        ("She had a ﬂower in her hair.", "She had a flower in her hair."),
+        ("The coﬃn was placed in the grave.", "The coffin was placed in the grave."),
+        ("The buﬄe zone was clearly marked.", "The buffle zone was clearly marked."),
+        ("The craﬅsman worked with dedication.", "The craftsman worked with dedication."),
+        ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
+        ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
+        ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
+        ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
+        ("The poﬆman delivers mail daily.", "The postman delivers mail daily."),
+        (
+            "The symbol ʦ can be found in certain alphabets.",
+            "The symbol ts can be found in certain alphabets.",
+        ),
+    ],
+)
+def test_clean_ligatures(text, expected):  # runs once per (text, expected) pair above
+    assert core.clean_ligatures(text=text) == expected
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index 49c206844b..70682af42c 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -68,6 +68,37 @@ def clean_ordered_bullets(text) -> str:
     return text_cl
 
 
+def clean_ligatures(text: str) -> str:
+    """Replaces ligatures with their most likely equivalent characters; other text is kept.
+
+    Example
+    -------
+    The beneﬁts -> The benefits
+    High quality ﬁnancial -> High quality financial
+    """
+    # Every key is a single code point and no expansion contains a key, so one translate
+    # pass gives the same result as replacing the ligatures one after another.
+    ligature_expansions = str.maketrans(
+        {
+            "æ": "ae",
+            "Æ": "AE",
+            "ﬀ": "ff",
+            "ﬁ": "fi",
+            "ﬂ": "fl",
+            "ﬃ": "ffi",
+            "ﬄ": "ffl",
+            "ﬅ": "ft",
+            "ʪ": "ls",
+            "œ": "oe",
+            "Œ": "OE",
+            "ȹ": "qp",
+            "ﬆ": "st",
+            "ʦ": "ts",
+        },
+    )
+    return text.translate(ligature_expansions)
+
+
 def group_bullet_paragraph(paragraph: str) -> list:
     """Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
     For example: