From 6d68086f8e41ac5ce44a554a5116dcbbe7806c50 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 18 Nov 2024 13:00:11 +0000 Subject: [PATCH 1/3] Move to real classes for i18n classes for proper typing in strict mode --- src/zimscraperlib/i18n.py | 184 ++++++++++++++++---------------------- tests/i18n/test_i18n.py | 132 ++++++++++++++++----------- 2 files changed, 156 insertions(+), 160 deletions(-) diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py index a442d81f..021fbc82 100644 --- a/src/zimscraperlib/i18n.py +++ b/src/zimscraperlib/i18n.py @@ -1,13 +1,10 @@ -#!/usr/bin/env python3 -# vim: ai ts=4 sts=4 et sw=4 nu - from __future__ import annotations import re import babel -import iso639 -import iso639.exceptions +import iso639 # pyright: ignore[reportMissingTypeStubs] +import iso639.exceptions # pyright: ignore[reportMissingTypeStubs] ISO_LEVELS = ["1", "2b", "2t", "3", "5"] @@ -16,57 +13,61 @@ class NotFoundError(ValueError): pass -class Lang(dict): - - @property - def iso_639_1(self) -> str | None: - """ISO-639-1 language code""" - return self["iso-639-1"] - - @property - def iso_639_2b(self) -> str | None: - """ISO-639-2b language code""" - return self["iso-639-2b"] - - @property - def iso_639_2t(self) -> str | None: - """ISO-639-2t language code""" - return self["iso-639-2t"] - - @property - def iso_639_3(self) -> str | None: - """ISO-639-3 language code""" - return self["iso-639-3"] - - @property - def iso_639_5(self) -> str | None: - """ISO-639-5 language code""" - return self["iso-639-5"] - - @property - def english(self) -> str: - """language name in English""" - return self["english"] - - @property - def native(self) -> str: - """language name in native language""" - return self["native"] - - @property - def iso_types(self) -> list[str]: - """list of supported iso types""" - return self["iso_types"] - - @property - def query(self) -> str: - """Query issued for these language details""" - return self["query"] - - @property - def 
querytype(self) -> str: - """Type of query issued to retrieve language details""" - return self["querytype"] +class Lang: + + def __init__(self, requested_lang: str, iso639_lang_obj: iso639.Lang): + self.iso_639_1 = iso639_lang_obj.pt1 or None + self.iso_639_2b = iso639_lang_obj.pt2b or None + self.iso_639_2t = iso639_lang_obj.pt2t or None + self.iso_639_3 = iso639_lang_obj.pt3 or None + self.iso_639_5 = iso639_lang_obj.pt5 or None + self.english = iso639_lang_obj.name or None + self.iso_types = [ + part_level + for iso_level, part_level in [ + (f"pt{level}", f"part{level}") for level in ISO_LEVELS + ] + + [("name", "name")] + if getattr(iso639_lang_obj, iso_level).lower() == requested_lang.lower() + ] + + +class LangAndDetails: + def __init__( + self, lang: Lang, english_name: str, native: str, querytype: str, query: str + ): + self.iso_639_1 = lang.iso_639_1 + self.iso_639_2b = lang.iso_639_2b + self.iso_639_2t = lang.iso_639_2t + self.iso_639_3 = lang.iso_639_3 + self.iso_639_5 = lang.iso_639_5 + self.iso_types = lang.iso_types + self.english = english_name + self.native = native + self.querytype = querytype + self.query = query + + def __eq__(self, value: object) -> bool: + if not isinstance(value, LangAndDetails): + return False + + return ( + self.iso_639_1 == value.iso_639_1 + and self.iso_639_2b == value.iso_639_2b + and self.iso_639_2t == value.iso_639_2t + and self.iso_639_3 == value.iso_639_3 + and self.iso_639_5 == value.iso_639_5 + and self.english == value.english + and self.native == value.native + ) + + def __repr__(self) -> str: + return ( + f"iso_639_1:{self.iso_639_1}, iso_639_2b:{self.iso_639_2b}, " + f"iso_639_2t:{self.iso_639_2t}, iso_639_3:{self.iso_639_3}, " + f"iso_639_5:{self.iso_639_5}, iso_639_5:{self.english}, " + f"iso_639_5:{self.native}" + ) def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]: @@ -74,8 +75,6 @@ def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]: Returns a tuple (main_language, macro_language | 
None)""" - iso_types = [] - try: isolang = iso639.Lang(lang) except ( @@ -84,39 +83,16 @@ def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]: ) as exc: raise NotFoundError("Not a valid iso language name/code") from exc - def replace_types(new_type: str) -> str: - # convert new iso_types from iso639-lang Pypi package to old iso_types from - # iso-639 package, since we were returning these values for a long time - if new_type == "pt1": - return "part1" - elif new_type == "pt2b": - return "part2b" - elif new_type == "pt2t": - return "part2t" - elif new_type == "pt3": - return "part3" - elif new_type == "pt5": - return "part5" - return new_type - - for code_type in [f"pt{lang_}" for lang_ in ISO_LEVELS] + ["name"]: - # the `if` condition below is a bit hackish but it is the only way to know - # if the passed value is matching a code type or not with new python-i639 - # library and we do not expect weird things to happen here - if str(getattr(isolang, code_type)).lower() == lang.lower(): - iso_types.append(replace_types(code_type)) - - lang_data = Lang( - **{f"iso-639-{lang_}": getattr(isolang, f"pt{lang_}") for lang_ in ISO_LEVELS} - ) - lang_data.update({"english": isolang.name, "iso_types": iso_types}) - - # first item in the returned tuple + ourlang = Lang(lang, isolang) + macro = isolang.macro() - return (lang_data, get_iso_lang_data(macro.name)[0] if macro else None) + + return (ourlang, get_iso_lang_data(macro.name)[0] if macro else None) -def find_language_names(query: str, lang_data: Lang | None = None) -> tuple[str, str]: +def find_language_names( + query: str, lang_data: Lang | LangAndDetails | None = None +) -> tuple[str, str]: """(native, english) language names for lang with help from lang_data Falls back to English name if available or query if not""" @@ -134,30 +110,33 @@ def find_language_names(query: str, lang_data: Lang | None = None) -> tuple[str, pass # ISO code lookup order matters (most qualified first)! 
- for iso_level in [f"iso-639-{lang_}" for lang_ in reversed(ISO_LEVELS)]: + for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]: try: - query_locale = babel.Locale.parse(lang_data.get(iso_level)) + query_locale = babel.Locale.parse(getattr(lang_data, iso_level)) if native_display_name := query_locale.get_display_name(): if english_display_name := query_locale.get_display_name("en"): return native_display_name, english_display_name except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError): pass - default = lang_data.get("english") or query + default = lang_data.english or query return default, default def update_with_macro(lang_data: Lang, macro_data: Lang | None): """update empty keys from lang_data with ones of macro_data""" - if macro_data: - for key, value in macro_data.items(): - if key in lang_data and not lang_data.get(key): - lang_data[key] = value + if not macro_data: + return lang_data + + for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]: + if not getattr(lang_data, iso_level): + setattr(lang_data, iso_level, getattr(macro_data, iso_level)) + return lang_data def get_language_details( query: str, failsafe: bool | None = False # noqa: FBT002 -) -> Lang | None: +) -> LangAndDetails | None: """language details dict from query. 
When query fails, either raises NotFoundError or return None, based on failsafe @@ -191,17 +170,10 @@ def get_language_details( iso_data = update_with_macro(lang_data, macro_data) native_name, english_name = find_language_names(native_query, iso_data) - iso_data.update( - { - "english": english_name, - "native": native_name, - "querytype": query_type, - "query": query, - } - ) - return iso_data + return LangAndDetails(iso_data, english_name, native_name, query_type, query) def is_valid_iso_639_3(code: str) -> bool: """whether code is a valid ISO-639-3 code""" - return (get_language_details(code, failsafe=True) or {}).get("iso-639-3") == code + lang = get_language_details(code, failsafe=True) + return lang is not None and lang.iso_639_3 == code diff --git a/tests/i18n/test_i18n.py b/tests/i18n/test_i18n.py index 2c9720d2..67353eff 100644 --- a/tests/i18n/test_i18n.py +++ b/tests/i18n/test_i18n.py @@ -1,12 +1,9 @@ -#!/usr/bin/env python3 -# vim: ai ts=4 sts=4 et sw=4 nu - +from typing import Any from unittest.mock import Mock import pytest from zimscraperlib.i18n import ( - Lang, NotFoundError, find_language_names, get_language_details, @@ -23,7 +20,7 @@ "iso-639-2b": "chi", "iso-639-2t": "zho", "iso-639-3": "zho", - "iso-639-5": "", + "iso-639-5": None, "english": "Chinese (Simplified)", "iso_types": ["part1"], "querytype": "locale", @@ -38,7 +35,7 @@ "iso-639-2b": "hin", "iso-639-2t": "hin", "iso-639-3": "hin", - "iso-639-5": "", + "iso-639-5": None, "english": "Hindi", "iso_types": ["part1"], "querytype": "purecode", @@ -53,7 +50,7 @@ "iso-639-2b": "hin", "iso-639-2t": "hin", "iso-639-3": "hin", - "iso-639-5": "", + "iso-639-5": None, "english": "Hindi (India)", "iso_types": ["part2b", "part2t", "part3"], "querytype": "purecode", @@ -68,7 +65,7 @@ "iso-639-2b": "jpn", "iso-639-2t": "jpn", "iso-639-3": "jpn", - "iso-639-5": "", + "iso-639-5": None, "english": "Japanese (Japan)", "iso_types": ["name"], "querytype": "languagename", @@ -79,10 +76,10 @@ ( "afa", { - 
"iso-639-1": "", + "iso-639-1": None, "iso-639-2b": "afa", "iso-639-2t": "afa", - "iso-639-3": "", + "iso-639-3": None, "iso-639-5": "afa", "english": "Afro-Asiatic languages", "iso_types": ["part2b", "part2t", "part5"], @@ -94,10 +91,10 @@ ( "afro-asiatic languages", { - "iso-639-1": "", + "iso-639-1": None, "iso-639-2b": "afa", "iso-639-2t": "afa", - "iso-639-3": "", + "iso-639-3": None, "iso-639-5": "afa", "english": "Afro-Asiatic languages", "iso_types": ["name"], @@ -113,7 +110,7 @@ "iso-639-2b": "chi", "iso-639-2t": "zho", "iso-639-3": "cmn", - "iso-639-5": "", + "iso-639-5": None, "english": "Chinese (Simplified, China)", "iso_types": ["part3"], "querytype": "purecode", @@ -144,7 +141,7 @@ "iso-639-2b": "ara", "iso-639-2t": "ara", "iso-639-3": "arq", - "iso-639-5": "", + "iso-639-5": None, "english": "Arabic (Egypt)", "iso_types": ["part3"], "native": "العربية (مصر)", @@ -159,7 +156,7 @@ "iso-639-2b": "ara", "iso-639-2t": "ara", "iso-639-3": "ara", - "iso-639-5": "", + "iso-639-5": None, "english": "Arabic (Morocco)", "iso_types": ["part1"], "native": "العربية (المغرب)", @@ -169,25 +166,24 @@ ), ], ) -def test_lang_details(query, expected): +def test_lang_details(query: str, expected: dict[str, Any] | None): if expected is None: assert get_language_details(query, failsafe=True) == expected with pytest.raises(NotFoundError): get_language_details(query) else: result = get_language_details(query) - assert result == expected - if result: - assert result.iso_639_1 == expected.get("iso-639-1") - assert result.iso_639_2b == expected.get("iso-639-2b") - assert result.iso_639_2t == expected.get("iso-639-2t") - assert result.iso_639_3 == expected.get("iso-639-3") - assert result.iso_639_5 == expected.get("iso-639-5") - assert result.english == expected.get("english") - assert result.native == expected.get("native") - assert result.iso_types == expected.get("iso_types") - assert result.query == expected.get("query") - assert result.querytype == 
expected.get("querytype") + assert result + assert result.iso_639_1 == expected.get("iso-639-1") + assert result.iso_639_2b == expected.get("iso-639-2b") + assert result.iso_639_2t == expected.get("iso-639-2t") + assert result.iso_639_3 == expected.get("iso-639-3") + assert result.iso_639_5 == expected.get("iso-639-5") + assert result.english == expected.get("english") + assert result.native == expected.get("native") + assert result.iso_types == expected.get("iso_types") + assert result.query == expected.get("query") + assert result.querytype == expected.get("querytype") @pytest.mark.parametrize( @@ -201,48 +197,76 @@ def test_lang_details(query, expected): ("qq", ("qq", "qq")), ], ) -def test_lang_name(query, expected): +def test_lang_name(query: str, expected: tuple[str, str]): assert find_language_names(query) == expected @pytest.mark.parametrize( - "dict_data", - [{}, {"iso-639-1": "ar"}], + "babel_native_return, babel_english_return, expected_native, expected_english", + [ + ("Native value", "English value", "Native value", "English value"), + (None, "English value", "German", "German"), + ("Native value", None, "German", "German"), + ], ) -def test_lang_equals(dict_data): - assert Lang(dict_data) == Lang(dict_data) - assert Lang(dict_data) == Lang({**dict_data}) +def test_find_language_names( + mocker: Mock, + babel_native_return: str | None, + babel_english_return: str | None, + expected_native: str, + expected_english: str, +): + mock_locale = Mock() + + def mock_display_name(lang: str | None = None) -> str | None: + return babel_native_return if lang is None else babel_english_return + + mock_locale.get_display_name.side_effect = mock_display_name + + mocker.patch("babel.Locale.parse", return_value=mock_locale) + + assert find_language_names("de") == (expected_native, expected_english) @pytest.mark.parametrize( - "dict_data_left, dict_data_right", + "query_left, query_right", [ - ({}, {"iso-639-1": "ar"}), - ({"iso-639-1": "ar"}, {"iso-639-1": "ab"}), - 
({"iso-639-1": "ar"}, {"iso-639-2": "ar"}), + pytest.param("ara", "Arabic", id="arabic"), + pytest.param("fra", "French", id="french"), ], ) -def test_lang_not_equals(dict_data_left, dict_data_right): - assert Lang(dict_data_left) != Lang(dict_data_right) - assert Lang(dict_data_left) != "foo" +def test_lang_details_equality(query_left: str, query_right: str): + assert get_language_details(query_left) == get_language_details(query_right) @pytest.mark.parametrize( - "babel_native_return, babel_english_return, expected_native, expected_english", + "patch_attribute", [ - ("Native value", "English value", "Native value", "English value"), - (None, "English value", "German", "German"), - ("Native value", None, "German", "German"), + "iso_639_1", + "iso_639_2b", + "iso_639_2t", + "iso_639_3", + "iso_639_5", + "english", + "native", ], ) -def test_find_language_names( - mocker, babel_native_return, babel_english_return, expected_native, expected_english -): - mock_locale = Mock() - mock_locale.get_display_name.side_effect = lambda lang=None: ( - babel_native_return if lang is None else babel_english_return - ) +def test_lang_details_inequality_with_patch(patch_attribute: str): + lang_and_details_patched = get_language_details("arq") + setattr(lang_and_details_patched, patch_attribute, "foo") + assert get_language_details("arq") != lang_and_details_patched - mocker.patch("babel.Locale.parse", return_value=mock_locale) - assert find_language_names("de") == (expected_native, expected_english) +@pytest.mark.parametrize( + "query_left, query_right", + [ + pytest.param("fra", "ara", id="different_lang"), + pytest.param("ar", "ar-AE", id="different_locale"), + ], +) +def test_lang_details_inequality(query_left: str, query_right: str): + assert get_language_details(query_left) != get_language_details(query_right) + + +def test_lang_details_inequality_objects(): + assert get_language_details("ara") != "ara" From 81c6617917acd08a9c9ad96bf777fc513a1041d9 Mon Sep 17 00:00:00 2001 
From: rgaudin Date: Thu, 19 Dec 2024 15:27:43 +0000 Subject: [PATCH 2/3] Simplify i18n API - Single `Language` class that takes a query and handles everything - `get_language()` and `get_language_or_none()` as go-to calls to get it - kept `find_language_names()` and `is_valid_iso_639_3()` but reusing `Language` --- src/zimscraperlib/i18n.py | 291 ++++++++++++++++++++------------------ tests/i18n/test_i18n.py | 20 +-- 2 files changed, 165 insertions(+), 146 deletions(-) diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py index 021fbc82..a5b9429c 100644 --- a/src/zimscraperlib/i18n.py +++ b/src/zimscraperlib/i18n.py @@ -9,171 +9,188 @@ ISO_LEVELS = ["1", "2b", "2t", "3", "5"] -class NotFoundError(ValueError): - pass +class NotFoundError(ValueError): ... + + +class Language: + """Qualified ISO-639-3 language""" + + def __init__(self, query: str): + """Instantiate a valid ISO-639-3 Language from query + + params: either an ISO-639 code or a locale or an english language name""" + self.iso_639_1: str | None = None + self.iso_639_2b: str | None = None + self.iso_639_2t: str | None = None + self.iso_639_3: str | None = None + self.iso_639_5: str | None = None + self.english: str | None = None + self.native: str | None = None + self.iso_types: list[str] = [] + self.query: str = query + self.native_query: str | None = None + self.querytype: str | None = None + + def get_adjusted_query(query: str) -> tuple[str, str, str]: + # possibly an iso-639 code + if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004 + adjusted_query = query + native_query = query + query_type = "purecode" + # possibly a locale + elif all(x.isalpha() or x in ("-", "_") for x in query) and ( + query.count("_") + query.count("-") == 1 + ): + adjusted_query = re.split("-|_", query)[0] + native_query = query.replace("-", "_") + query_type = "locale" + # possibly an ISO language name + else: + adjusted_query = query.title().replace("Languages", "languages") + native_query = query +
query_type = "languagename" + return adjusted_query, native_query, query_type + + adjusted_query, self.native_query, self.querytype = get_adjusted_query(query) - -class Lang: - - def __init__(self, requested_lang: str, iso639_lang_obj: iso639.Lang): - self.iso_639_1 = iso639_lang_obj.pt1 or None - self.iso_639_2b = iso639_lang_obj.pt2b or None - self.iso_639_2t = iso639_lang_obj.pt2t or None - self.iso_639_3 = iso639_lang_obj.pt3 or None - self.iso_639_5 = iso639_lang_obj.pt5 or None - self.english = iso639_lang_obj.name or None + try: + isolang = iso639.Lang(adjusted_query) + except ( + iso639.exceptions.InvalidLanguageValue, + iso639.exceptions.DeprecatedLanguageValue, + ) as exc: + raise NotFoundError("Not a valid iso language name/code") from exc + + parts_keys_map = { + "iso_639_1": "pt1", + "iso_639_2b": "pt2b", + "iso_639_2t": "pt2t", + "iso_639_3": "pt3", + "iso_639_5": "pt5", + "english": "name", + } + + self.iso_639_1 = isolang.pt1 or None + self.iso_639_2b = isolang.pt2b or None + self.iso_639_2t = isolang.pt2t or None + self.iso_639_3 = isolang.pt3 or None + self.iso_639_5 = isolang.pt5 or None + self.english = isolang.name or None self.iso_types = [ part_level for iso_level, part_level in [ (f"pt{level}", f"part{level}") for level in ISO_LEVELS ] + [("name", "name")] - if getattr(iso639_lang_obj, iso_level).lower() == requested_lang.lower() + if getattr(isolang, iso_level).lower() == adjusted_query.lower() ] - -class LangAndDetails: - def __init__( - self, lang: Lang, english_name: str, native: str, querytype: str, query: str - ): - self.iso_639_1 = lang.iso_639_1 - self.iso_639_2b = lang.iso_639_2b - self.iso_639_2t = lang.iso_639_2t - self.iso_639_3 = lang.iso_639_3 - self.iso_639_5 = lang.iso_639_5 - self.iso_types = lang.iso_types - self.english = english_name - self.native = native - self.querytype = querytype - self.query = query - - def __eq__(self, value: object) -> bool: - if not isinstance(value, LangAndDetails): - return False - - return ( - 
self.iso_639_1 == value.iso_639_1 - and self.iso_639_2b == value.iso_639_2b - and self.iso_639_2t == value.iso_639_2t - and self.iso_639_3 == value.iso_639_3 - and self.iso_639_5 == value.iso_639_5 - and self.english == value.english - and self.native == value.native - ) - - def __repr__(self) -> str: - return ( - f"iso_639_1:{self.iso_639_1}, iso_639_2b:{self.iso_639_2b}, " - f"iso_639_2t:{self.iso_639_2t}, iso_639_3:{self.iso_639_3}, " - f"iso_639_5:{self.iso_639_5}, iso_639_5:{self.english}, " - f"iso_639_5:{self.native}" - ) - - -def get_iso_lang_data(lang: str) -> tuple[Lang, Lang | None]: - """ISO-639-x languages details for lang. Raises NotFoundError - - Returns a tuple (main_language, macro_language | None)""" - - try: - isolang = iso639.Lang(lang) - except ( - iso639.exceptions.InvalidLanguageValue, - iso639.exceptions.DeprecatedLanguageValue, - ) as exc: - raise NotFoundError("Not a valid iso language name/code") from exc - - ourlang = Lang(lang, isolang) - - macro = isolang.macro() - - return (ourlang, get_iso_lang_data(macro.name)[0] if macro else None) - - -def find_language_names( - query: str, lang_data: Lang | LangAndDetails | None = None -) -> tuple[str, str]: - """(native, english) language names for lang with help from lang_data - - Falls back to English name if available or query if not""" - if lang_data is None: - lang_data = get_language_details(query, failsafe=True) - if not lang_data: - return query, query - - try: - query_locale = babel.Locale.parse(query) - if native_display_name := query_locale.get_display_name(): - if english_display_name := query_locale.get_display_name("en"): - return native_display_name, english_display_name - except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError): - pass - - # ISO code lookup order matters (most qualified first)! 
- for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]: + # update if language has a macro + if isolang.macro(): + for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]: + if not getattr(self, iso_level): + setattr( + self, + iso_level, + # we'll get the pt attr for each iso_xxx + getattr(isolang.macro(), parts_keys_map[iso_level], None) + # we want None if value is empty + or None, + ) + + self.native, self.english = self._get_names_from(self.native_query) + + def _get_names_from(self, query: str) -> tuple[str, str]: + """logic to find language names from babel and fallback""" try: - query_locale = babel.Locale.parse(getattr(lang_data, iso_level)) + query_locale = babel.Locale.parse(query) if native_display_name := query_locale.get_display_name(): if english_display_name := query_locale.get_display_name("en"): return native_display_name, english_display_name except (babel.UnknownLocaleError, TypeError, ValueError, AttributeError): pass - default = lang_data.english or query - return default, default + # ISO code lookup order matters (most qualified first)! 
+ for iso_level in [f"iso_639_{level}" for level in reversed(ISO_LEVELS)]: + try: + query_locale = babel.Locale.parse(getattr(self, iso_level)) + if native_display_name := query_locale.get_display_name(): + if english_display_name := query_locale.get_display_name("en"): + return native_display_name, english_display_name + except ( + babel.UnknownLocaleError, + TypeError, + ValueError, + AttributeError, + ): + pass + default = self.english or query + return default, default + + def todict(self) -> dict[str, str | None | list[str]]: + return { + key.replace("_", "-") if key.startswith("iso") else key: getattr( + self, key, None + ) + for key in [ + "iso_639_1", + "iso_639_2b", + "iso_639_2t", + "iso_639_3", + "iso_639_5", + "english", + "iso_types", + "native", + "querytype", + "query", + ] + } -def update_with_macro(lang_data: Lang, macro_data: Lang | None): - """update empty keys from lang_data with ones of macro_data""" - if not macro_data: - return lang_data + def __repr__(self) -> str: + data_repr = ", ".join( + f'{key.replace("-", "_")}="{value}"' for key, value in self.todict().items() + ) + return f"{type(self).__name__}({data_repr})" - for iso_level in [f"iso_639_{level}" for level in ISO_LEVELS]: - if not getattr(lang_data, iso_level): - setattr(lang_data, iso_level, getattr(macro_data, iso_level)) + def __str__(self) -> str: + return f"{self.iso_639_3}: {self.english}" - return lang_data + def __eq__(self, value: object) -> bool: + return ( + self.iso_639_1 == getattr(value, "iso_639_1", None) + and self.iso_639_2b == getattr(value, "iso_639_2b", None) + and self.iso_639_2t == getattr(value, "iso_639_2t", None) + and self.iso_639_3 == getattr(value, "iso_639_3", None) + and self.iso_639_5 == getattr(value, "iso_639_5", None) + and self.english == getattr(value, "english", None) + and self.native == getattr(value, "native", None) + ) -def get_language_details( - query: str, failsafe: bool | None = False # noqa: FBT002 -) -> LangAndDetails | None: - 
"""language details dict from query. +def find_language_names(query: str) -> tuple[str, str]: + """(native, english) language names for query""" + try: + lang = Language(query) + except NotFoundError: + return query, query + # should be qualified but "None" is as valid as anything if not + return str(lang.native), str(lang.english) - When query fails, either raises NotFoundError or return None, based on failsafe - """ +def get_language(lang_code: str) -> Language: + """Language from lang_code""" + return Language(lang_code) - if query.isalpha() and (2 <= len(query) <= 3): # noqa: PLR2004 - # possibility of iso-639 code - adjusted_query = query - native_query = query - query_type = "purecode" - elif all(x.isalpha() or x in ("-", "_") for x in query) and ( - query.count("_") + query.count("-") == 1 - ): - # possibility of locale - adjusted_query = re.split("-|_", query)[0] - native_query = query.replace("-", "_") - query_type = "locale" - else: - # possibility of iso language name - adjusted_query = query.title().replace("Languages", "languages") - native_query = query - query_type = "languagename" +def get_language_or_none(lang_code: str) -> Language | None: + """Language from lang_code or None if not found""" try: - lang_data, macro_data = get_iso_lang_data(adjusted_query) - except NotFoundError as exc: - if failsafe: - return None - raise exc - - iso_data = update_with_macro(lang_data, macro_data) - native_name, english_name = find_language_names(native_query, iso_data) - return LangAndDetails(iso_data, english_name, native_name, query_type, query) + return get_language(lang_code) + except NotFoundError: + return None def is_valid_iso_639_3(code: str) -> bool: """whether code is a valid ISO-639-3 code""" - lang = get_language_details(code, failsafe=True) + lang = get_language_or_none(code) return lang is not None and lang.iso_639_3 == code diff --git a/tests/i18n/test_i18n.py b/tests/i18n/test_i18n.py index 67353eff..c563819f 100644 --- a/tests/i18n/test_i18n.py 
+++ b/tests/i18n/test_i18n.py @@ -4,9 +4,11 @@ import pytest from zimscraperlib.i18n import ( + Language, NotFoundError, find_language_names, - get_language_details, + get_language, + get_language_or_none, ) @@ -168,11 +170,11 @@ ) def test_lang_details(query: str, expected: dict[str, Any] | None): if expected is None: - assert get_language_details(query, failsafe=True) == expected + assert get_language_or_none(query) == expected with pytest.raises(NotFoundError): - get_language_details(query) + get_language(query) else: - result = get_language_details(query) + result = get_language_or_none(query) assert result assert result.iso_639_1 == expected.get("iso-639-1") assert result.iso_639_2b == expected.get("iso-639-2b") @@ -236,7 +238,7 @@ def mock_display_name(lang: str | None = None) -> str | None: ], ) def test_lang_details_equality(query_left: str, query_right: str): - assert get_language_details(query_left) == get_language_details(query_right) + assert Language(query_left) == Language(query_right) @pytest.mark.parametrize( @@ -252,9 +254,9 @@ def test_lang_details_equality(query_left: str, query_right: str): ], ) def test_lang_details_inequality_with_patch(patch_attribute: str): - lang_and_details_patched = get_language_details("arq") + lang_and_details_patched = get_language("arq") setattr(lang_and_details_patched, patch_attribute, "foo") - assert get_language_details("arq") != lang_and_details_patched + assert get_language("arq") != lang_and_details_patched @pytest.mark.parametrize( @@ -265,8 +267,8 @@ def test_lang_details_inequality_with_patch(patch_attribute: str): ], ) def test_lang_details_inequality(query_left: str, query_right: str): - assert get_language_details(query_left) != get_language_details(query_right) + assert get_language(query_left) != get_language(query_right) def test_lang_details_inequality_objects(): - assert get_language_details("ara") != "ara" + assert get_language("ara") != "ara" From 67844f45d3fc160a24a4a1ca2e6b18be0c0eaf6c Mon Sep 17 
00:00:00 2001 From: benoit74 Date: Fri, 20 Dec 2024 07:42:35 +0000 Subject: [PATCH 3/3] Add more tests for i18n new API --- tests/i18n/test_i18n.py | 102 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/tests/i18n/test_i18n.py b/tests/i18n/test_i18n.py index c563819f..c1a8ed21 100644 --- a/tests/i18n/test_i18n.py +++ b/tests/i18n/test_i18n.py @@ -272,3 +272,105 @@ def test_lang_details_inequality(query_left: str, query_right: str): def test_lang_details_inequality_objects(): assert get_language("ara") != "ara" + + +@pytest.mark.parametrize( + "query,expected", + [ + ( + "fra", + { + "english": "French (France)", + "iso-639-1": "fr", + "iso-639-2b": "fre", + "iso-639-2t": "fra", + "iso-639-3": "fra", + "iso-639-5": None, + "iso-types": [ + "part2t", + "part3", + ], + "native": "français (France)", + "query": "fra", + "querytype": "purecode", + }, + ), + ( + "zh", + { + "english": "Chinese", + "iso-639-1": "zh", + "iso-639-2b": "chi", + "iso-639-2t": "zho", + "iso-639-3": "zho", + "iso-639-5": None, + "iso-types": [ + "part1", + ], + "native": "中文", + "query": "zh", + "querytype": "purecode", + }, + ), + ( + "ar", + { + "english": "Arabic", + "iso-639-1": "ar", + "iso-639-2b": "ara", + "iso-639-2t": "ara", + "iso-639-3": "ara", + "iso-639-5": None, + "iso-types": [ + "part1", + ], + "native": "العربية", + "query": "ar", + "querytype": "purecode", + }, + ), + ], +) +def test_lang_to_dict(query: str, expected: dict[str, str | None | list[str]]): + assert Language(query).todict() == expected + + +@pytest.mark.parametrize( + "query,expected", + [ + ( + "fra", + 'Language(iso_639_1="fr", iso_639_2b="fre", iso_639_2t="fra", ' + 'iso_639_3="fra", iso_639_5="None", english="French (France)", ' + "iso_types=\"['part2t', 'part3']\", native=\"français (France)\", " + 'querytype="purecode", query="fra")', + ), + ( + "zh", + 'Language(iso_639_1="zh", iso_639_2b="chi", iso_639_2t="zho", ' + 'iso_639_3="zho", iso_639_5="None", 
english="Chinese", ' + 'iso_types="[\'part1\']", native="中文", querytype="purecode", query="zh")', + ), + ( + "ar", + 'Language(iso_639_1="ar", iso_639_2b="ara", iso_639_2t="ara", ' + 'iso_639_3="ara", iso_639_5="None", english="Arabic", ' + 'iso_types="[\'part1\']", native="العربية", querytype="purecode", ' + 'query="ar")', + ), + ], +) +def test_lang_repr(query: str, expected: str): + assert Language(query).__repr__() == expected + + +@pytest.mark.parametrize( + "query,expected", + [ + ("fra", "fra: French (France)"), + ("zh", "zho: Chinese"), + ("ar", "ara: Arabic"), + ], +) +def test_lang_str(query: str, expected: str): + assert f"{Language(query)}" == expected