Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ignore_script and tested it. #17

Merged
merged 2 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions langcodes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,14 +674,16 @@ def match_score(self, supported: 'Language') -> int:
)
return 100 - min(self.distance(supported), 100)

def distance(self, supported: 'Language') -> int:
def distance(self, supported: 'Language', ignore_script: bool = False) -> int:
"""
Suppose that `self` is the language that the user desires, and
`supported` is a language that is actually supported.

This method returns a number from 0 to 134 measuring the 'distance'
between the languages (lower numbers are better). This is not a
symmetric relation.
symmetric relation. If `ignore_script` is `True`, the script will
not be used in the comparison, possibly resulting in a smaller
'distance'.

The language distance is not really about the linguistic similarity or
history of the languages; instead, it's based largely on sociopolitical
Expand All @@ -703,9 +705,10 @@ def distance(self, supported: 'Language') -> int:
desired_triple = ('und', 'Zzzz', 'ZZ')
else:
desired_complete = self.prefer_macrolanguage().maximize()

desired_triple = (
desired_complete.language,
desired_complete.script,
None if ignore_script else desired_complete.script,
desired_complete.territory,
)

Expand All @@ -717,9 +720,10 @@ def distance(self, supported: 'Language') -> int:
supported_triple = ('und', 'Zzzz', 'ZZ')
else:
supported_complete = supported.prefer_macrolanguage().maximize()

supported_triple = (
supported_complete.language,
supported_complete.script,
None if ignore_script else supported_complete.script,
supported_complete.territory,
)

Expand Down Expand Up @@ -1648,7 +1652,7 @@ def tag_match_score(
return desired_ld.match_score(supported_ld)


def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int:
def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int:
"""
Tags that expand to the same thing when likely values are filled in get a
distance of 0.
Expand Down Expand Up @@ -1792,13 +1796,19 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language])
>>> tag_distance('ja', 'ja-Latn-US-hepburn')
54

If `ignore_script` is True, differences in script are ignored, which can
result in a smaller distance.

>>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True)
0

>>> # You can read the Shavian script, right?
>>> tag_distance('en', 'en-Shaw')
54
"""
desired_obj = Language.get(desired)
supported_obj = Language.get(supported)
return desired_obj.distance(supported_obj)
return desired_obj.distance(supported_obj, ignore_script)


def best_match(
Expand Down Expand Up @@ -1835,6 +1845,7 @@ def closest_match(
desired_language: Union[str, Language],
supported_languages: Sequence[str],
max_distance: int = 25,
ignore_script: bool = False,
) -> Tuple[str, int]:
"""
You have software that supports any of the `supported_languages`. You want
Expand All @@ -1854,6 +1865,9 @@ def closest_match(
the wrong language. The documentation for `tag_distance` describes the
distance values in more detail.

`ignore_script` makes the matching ignore scripts, allowing matches to be
found when they wouldn't otherwise be due to different scripts.

When there is a tie for the best matching language, the first one in the
tie will be used.

Expand All @@ -1871,6 +1885,9 @@ def closest_match(

>>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
('und', 1000)

>>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True)
('ja-Latn-hepburn', 0)
"""
desired_language = str(desired_language)

Expand All @@ -1884,7 +1901,7 @@ def closest_match(
return desired_language, 0

match_distances = [
(supported, tag_distance(desired_language, supported))
(supported, tag_distance(desired_language, supported, ignore_script))
for supported in supported_languages
]
match_distances = [
Expand Down
21 changes: 21 additions & 0 deletions langcodes/tests/test_issue_59.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from langcodes import closest_match


def test_language_less_than():
    """Desired tag 'pa' must match the more specific supported tag 'pa-PK'.

    With ``ignore_script=True`` the lookup is expected to return a real
    supported tag rather than the 'und' placeholder.
    """
    # NOTE(review): presumably 'pa' and 'pa-PK' resolve to different likely
    # scripts, which is why ignore_script is needed here -- confirm against
    # the issue this test file is named after.
    desired = 'pa'
    supported = 'pa-PK'
    match = closest_match(desired, [supported], ignore_script=True)
    # Carry the result in the assertion message instead of printing it, so
    # the diagnostic only appears when the test fails.
    assert match[0] != "und", match


def test_language_more_than():
    """Desired tag 'pa-PK' must match the less specific supported tag 'pa'.

    With ``ignore_script=True`` the lookup is expected to return a real
    supported tag rather than the 'und' placeholder.
    """
    # NOTE(review): presumably 'pa-PK' and 'pa' resolve to different likely
    # scripts, which is why ignore_script is needed here -- confirm against
    # the issue this test file is named after.
    desired = 'pa-PK'
    supported = 'pa'
    match = closest_match(desired, [supported], ignore_script=True)
    # Carry the result in the assertion message instead of printing it, so
    # the diagnostic only appears when the test fails.
    assert match[0] != "und", match