# Input/expected pairs for ``core.clean_ligatures``.
# NOTE(review): several inputs below are identical to their expected output as
# written here. If they were meant to contain single-code-point ligatures
# (e.g. U+FB00-U+FB06), confirm the characters survived encoding -- TODO confirm.
_LIGATURE_CASES = [
    ("The æther is a classic element.", "The aether is a classic element."),
    ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
    ("The buffer zone is there.", "The buffer zone is there."),
    ("The file was found in the system.", "The file was found in the system."),
    ("She had a flower in her hair.", "She had a flower in her hair."),
    ("The coffin was placed in the grave.", "The coffin was placed in the grave."),
    ("The buffle zone was clearly marked.", "The buffle zone was clearly marked."),
    ("The craſtsman worked with dedication.", "The craftsman worked with dedication."),
    ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
    ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
    ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
    ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
    ("The postman delivers mail daily.", "The postman delivers mail daily."),
    (
        "The symbol ʦ can be found in certain alphabets.",
        "The symbol ts can be found in certain alphabets.",
    ),
]


@pytest.mark.parametrize(("text", "expected"), _LIGATURE_CASES)
def test_clean_ligatures(text, expected):
    """Check that ``clean_ligatures`` turns each sample input into its expected text."""
    cleaned = core.clean_ligatures(text=text)
    assert cleaned == expected
def clean_ligatures(text: str) -> str:
    """Replaces ligatures with their most likely equivalent characters.

    Every ligature or digraph code point listed in the map below is expanded
    into the plain letters it stands for. Text that contains none of them is
    returned unchanged.

    Parameters
    ----------
    text
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` with the known ligatures expanded.

    Example
    -------
    "The bene" + U+FB01 + "ts"        -> "The benefits"
    "High quality " + U+FB01 + "nancial" -> "High quality financial"
    """
    # Keys are spelled as escapes on purpose: literal ligature characters are
    # easily folded into plain letters by editors or normalisation steps, which
    # silently turns an entry into a no-op such as "ff" -> "ff".
    ligatures_map = {
        "\u00e6": "ae",  # LATIN SMALL LETTER AE
        "\u00c6": "AE",  # LATIN CAPITAL LETTER AE
        "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
        "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
        "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
        "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
        "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
        # Kept consistent with the two-character long-s + "t" handling below.
        "\ufb05": "ft",  # LATIN SMALL LIGATURE LONG S T
        "\u02aa": "ls",  # LATIN SMALL LETTER LS DIGRAPH
        "\u0153": "oe",  # LATIN SMALL LIGATURE OE
        "\u0152": "OE",  # LATIN CAPITAL LIGATURE OE
        "\u0239": "qp",  # LATIN SMALL LETTER QP DIGRAPH
        "\ufb06": "st",  # LATIN SMALL LIGATURE ST
        "\u02a6": "ts",  # LATIN SMALL LETTER TS DIGRAPH
    }

    # The decomposed form (long s followed by "t") is two characters, so it
    # cannot be a str.translate key. It is handled first so that a "t" produced
    # by expanding another ligature is never paired with a preceding long s.
    cleaned_text: str = text.replace("\u017ft", "ft")

    # One pass over the string for all single-code-point ligatures.
    return cleaned_text.translate(str.maketrans(ligatures_map))