Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maintenance: replace langcodes by babel #89

Merged
merged 4 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import Any, Optional, Tuple
from urllib.parse import urlsplit

from langcodes import Language, tag_is_valid
from babel import Locale, UnknownLocaleError # type: ignore

from .langinfo import COUNTRY_CODES, LANGUAGE_CODES

Expand Down Expand Up @@ -181,23 +181,19 @@ def extension_filter(urlpath: str) -> bool:


def langcodes_score(language: str, segment: str, score: int) -> int:
"""Use langcodes on selected URL segments and integrate
them into a score."""
# see also: https://babel.pocoo.org/en/latest/locale.html
"""Use language codes or locale parser on selected URL segments and
integrate them into a score."""
# test if the code looks like a country or a language
if segment[:2] not in COUNTRY_CODES and segment[:2] not in LANGUAGE_CODES:
return score
# test if the tag is valid (caution: private-use codes also count as valid)
if tag_is_valid(segment):
# try to identify language code
identified = Language.get(segment).language
# see if it matches
if identified is not None:
LOGGER.debug("langcode %s found in URL segment %s", identified, segment)
if identified != language:
score -= 1
else:
beginning = segment[:2]
if beginning in LANGUAGE_CODES or beginning in COUNTRY_CODES:
# use locale parser
try:
if Locale.parse(segment).language == language:
score += 1
else:
score -= 1
except UnknownLocaleError:
pass
return score


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def get_long_description():
include_package_data=True,
python_requires=">=3.6",
install_requires=[
"langcodes >= 3.3.0",
"babel >= 2.11.0",
"tld == 0.12.6; python_version < '3.7'",
"tld >= 0.13; python_version >= '3.7'",
"urllib3 >= 1.26, < 2; python_version < '3.7'",
Expand Down
12 changes: 11 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@
lang_filter,
)
from courlan.core import filter_links
from courlan.filters import domain_filter, extension_filter, path_filter, type_filter
from courlan.filters import (
domain_filter,
extension_filter,
langcodes_score,
path_filter,
type_filter,
)
from courlan.meta import clear_caches
from courlan.urlutils import _parse, get_tldinfo, is_known_link

Expand Down Expand Up @@ -440,6 +446,10 @@ def test_lang_filter():
lang_filter("http://bz.berlin1.de/kino/050513/fans.html", "de", strict=True)
is False
)
assert langcodes_score("en", "en_HK", 0) == 1
assert langcodes_score("en", "en_XY", 0) == 0
assert langcodes_score("en", "de_DE", 0) == -1

# assert lang_filter('http://www.verfassungen.de/ch/basel/verf03.htm', 'de') is True
# assert lang_filter('http://www.uni-stuttgart.de/hi/fnz/lehrveranst.html', 'de') is True
# http://www.wildwechsel.de/ww/front_content.php?idcatart=177&lang=4&client=6&a=view&eintrag=100&a=view&eintrag=0&a=view&eintrag=20&a=view&eintrag=80&a=view&eintrag=20
Expand Down
Loading