Skip to content

Commit

Permalink
Add clean_ligatures to core cleaners
Browse files Browse the repository at this point in the history
  • Loading branch information
walsha2 committed Sep 7, 2023
1 parent 09cc4bf commit 158de8d
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
### Features

* Add Jira Connector to be able to pull issues from a Jira organization
* Add `clean_ligatures` function to expand ligatures in text

### Fixes

Expand Down
26 changes: 26 additions & 0 deletions test_unstructured/cleaners/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,32 @@ def test_clean_ordered_bullets(text, expected):
assert core.clean_ordered_bullets(text=text) == expected


# (input text, expected cleaned text) pairs for ``core.clean_ligatures``.
# NOTE(review): in several pairs below the input is identical to the expected
# output as written here, so those cases do not exercise any replacement.
# If the inputs were meant to contain single-code-point ligatures, confirm
# against the original file and consider spelling them as ``\uXXXX`` escapes.
_LIGATURE_CASES = [
    ("The æther is a classic element.", "The aether is a classic element."),
    ("In old texts, Æsop's fables are", "In old texts, AEsop's fables are"),
    ("The buffer zone is there.", "The buffer zone is there."),
    ("The file was found in the system.", "The file was found in the system."),
    ("She had a flower in her hair.", "She had a flower in her hair."),
    ("The coffin was placed in the grave.", "The coffin was placed in the grave."),
    ("The buffle zone was clearly marked.", "The buffle zone was clearly marked."),
    ("The craſtsman worked with dedication.", "The craftsman worked with dedication."),
    ("The symbol ʪ is very rare.", "The symbol ls is very rare."),
    ("The word 'cœur' means 'heart' in French.", "The word 'coeur' means 'heart' in French."),
    ("The word 'Œuvre' refers to the works", "The word 'OEuvre' refers to the works"),
    ("The ȹ symbol is used in some contexts.", "The qp symbol is used in some contexts."),
    ("The postman delivers mail daily.", "The postman delivers mail daily."),
    (
        "The symbol ʦ can be found in certain alphabets.",
        "The symbol ts can be found in certain alphabets.",
    ),
]


@pytest.mark.parametrize(("text", "expected"), _LIGATURE_CASES)
def test_clean_ligatures(text, expected):
    """Check that ``core.clean_ligatures`` turns each input into its expected text."""
    cleaned = core.clean_ligatures(text=text)
    assert cleaned == expected


@pytest.mark.parametrize(
("text", "expected"),
[
Expand Down
31 changes: 31 additions & 0 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,37 @@ def clean_ordered_bullets(text) -> str:
return text_cl


def clean_ligatures(text: str) -> str:
    r"""Replaces ligatures with their most likely equivalent characters.

    Every ligature key is written as an explicit Unicode escape so that the
    single-code-point ligatures cannot be silently turned into plain ASCII
    letters (which would make the entry a no-op) when the source file passes
    through a tool that normalizes text.

    Parameters
    ----------
    text
        The string to clean.

    Returns
    -------
    str
        ``text`` with every occurrence of a known ligature replaced by its
        expanded form. Text without ligatures is returned unchanged.

    Example
    -------
    The bene\ufb01ts -> The benefits
    High quality \ufb01nancial -> High quality financial
    """
    ligatures_map = {
        "\u00e6": "ae",  # LATIN SMALL LETTER AE
        "\u00c6": "AE",  # LATIN CAPITAL LETTER AE
        "\ufb00": "ff",  # LATIN SMALL LIGATURE FF
        "\ufb01": "fi",  # LATIN SMALL LIGATURE FI
        "\ufb02": "fl",  # LATIN SMALL LIGATURE FL
        "\ufb03": "ffi",  # LATIN SMALL LIGATURE FFI
        "\ufb04": "ffl",  # LATIN SMALL LIGATURE FFL
        "\ufb05": "ft",  # LATIN SMALL LIGATURE LONG S T
        # Long s (U+017F) followed by "t": kept so input that already had the
        # ligature split into two characters is cleaned the same way as before.
        "\u017ft": "ft",
        "\u02aa": "ls",  # LATIN SMALL LETTER LS DIGRAPH
        "\u0153": "oe",  # LATIN SMALL LIGATURE OE
        "\u0152": "OE",  # LATIN CAPITAL LIGATURE OE
        "\u0239": "qp",  # LATIN SMALL LETTER QP DIGRAPH
        "\ufb06": "st",  # LATIN SMALL LIGATURE ST
        "\u02a6": "ts",  # LATIN SMALL LETTER TS DIGRAPH
    }
    cleaned_text: str = text
    # Replacements only ever emit ASCII letters and no key is pure ASCII, so
    # one substitution can never create a match for another; order is irrelevant.
    for ligature, expansion in ligatures_map.items():
        cleaned_text = cleaned_text.replace(ligature, expansion)

    return cleaned_text


def group_bullet_paragraph(paragraph: str) -> list:
"""Groups paragraphs with bullets that have line breaks for visual/formatting purposes.
For example:
Expand Down

0 comments on commit 158de8d

Please sign in to comment.