diff --git a/Cargo.toml b/Cargo.toml index 6140c34..b7738f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,12 @@ exclude = [ ] [dependencies] -language-tags = "0.3.2" +language-tags = { version = "0.3.2", optional = true } + +[features] +bcp47 = ["language-tags"] +posix = [] +default = ["bcp47", "posix"] [profile.dev] # Compilation diff --git a/src/lib.rs b/src/lib.rs index 5b295fd..e05ef5c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,4 +14,8 @@ //! You should have received a copy of the GNU Lesser General Public License //! along with this program. If not, see . -pub mod bcp47; \ No newline at end of file +#[cfg(feature = "bcp47")] +pub mod bcp47; + +#[cfg(feature = "posix")] +pub mod posix; \ No newline at end of file diff --git a/src/posix.rs b/src/posix.rs new file mode 100644 index 0000000..6bfef83 --- /dev/null +++ b/src/posix.rs @@ -0,0 +1,349 @@ +//! locale-match is a small library for matching locales. +//! Copyright (C) © 2024 Petr Alexandrovich Sabanov +//! +//! This program is free software: you can redistribute it and/or modify +//! it under the terms of the GNU Lesser General Public License as published by +//! the Free Software Foundation, either version 3 of the License, or +//! (at your option) any later version. +//! +//! This program is distributed in the hope that it will be useful, +//! but WITHOUT ANY WARRANTY; without even the implied warranty of +//! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//! GNU Lesser General Public License for more details. +//! +//! You should have received a copy of the GNU Lesser General Public License +//! along with this program. If not, see . + +/// Finds the best matching locale from a list of available locales based on a list of user locales. +/// The function expects locales to be valid POSIX locales, but does not validate them. +/// The function expects locales to be encoded with ASCII. +/// +/// The function compares user locales to available locales to find the best match. +/// For each user locale, it iterates through the available locales and, for those with a matching +/// primary language, calculates a score based on how closely each available locale matches the user +/// locale. +/// The score calculation gives higher priority to matching more significant parts of the locale +/// (i.e., earlier segments in the locale string). +/// If a subtag is empty, it is considered to match equally well with any subtag from the same +/// category. +/// +/// If multiple available locales have the same score, the function selects the one that appears +/// earlier in the list of available locales. +/// If no available locale matches the primary language of a user locale, the function moves to the +/// next user locale in the list. +/// If no matches are found for any user locale, the function returns [`None`]. +/// +/// Malformed locales are ignored. +/// +/// # Arguments +/// +/// * `available_locales` - An iterator over locale strings representing the available locales. +/// These locales should be ordered by priority, meaning that a locale appearing earlier in this +/// list is considered more preferable for the program. +/// * `user_locales` - An iterator over locale strings representing the user locales to match +/// against. These locales should also be ordered by priority, meaning that a locale appearing +/// earlier in this list is considered more desirable for the user. +/// +/// # Returns +/// +/// Returns an [`Option`] containing the string representation of the best matching locale. +/// If multiple available locales match the same user locale with equal score, the one that appears +/// earlier in the list of available locales is chosen. +/// If no match is found, [`None`] is returned. +/// +/// The returned locale is guaranteed to EXACTLY match one of the available locales. +/// For example, `best_matching_locale(&["EN"].iter(), &["en"].iter())` will return `Some("EN")`. +/// +/// # Examples +/// +/// ``` +/// use locale_match::posix::best_matching_locale; +/// +/// +/// let available_locales = vec!["en_US", "en_GB", "ru_UA", "fr_FR", "it"]; +/// let user_locales = vec!["ru_RU", "ru", "en_US", "en"]; +/// +/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter()); +/// +/// // "ru_UA" is the best match for the highest-priority user locale "ru_RU" +/// assert_eq!(best_match, Some("ru_UA")); +/// +/// +/// let available_locales = vec!["en", "pt_BR", "pt_PT", "es"]; +/// let user_locales = vec!["pt", "en"]; +/// +/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter()); +/// +/// // "pt_BR" is the first best match for the highest-priority user locale "pt" +/// assert_eq!(best_match, Some("pt_BR")); +/// +/// +/// let available_locales = vec!["fr", "fr_FR", "fr_CA.UTF-8"]; +/// let user_locales = vec!["fr.UTF-8"]; +/// +/// let best_match = best_matching_locale(available_locales.iter(), user_locales.iter()); +/// +/// // Empty territory in "fr.UTF-8" matches any territory, e.g. "CA" +/// assert_eq!(best_match, Some("fr_CA.UTF-8")); +/// ``` +pub fn best_matching_locale<'a, 'b, T1, T2>(available_locales: impl Iterator, user_locales: impl Iterator) -> Option<&'a str> +where + T1: AsRef + 'a, + T2: AsRef + 'b +{ + let available_parsed_locales = available_locales + .map(|l| PosixLocale::parse(l.as_ref())) + .collect::>(); + + user_locales + .map(|locale| PosixLocale::parse(locale.as_ref())) + .find_map(|user_locale| + available_parsed_locales.iter() + .rev() // For max_by_key to return the first locale with max score + .filter(|aval_locale| aval_locale.language.eq_ignore_ascii_case(user_locale.language)) + .max_by_key(|aval_locale| { + let mut score = 0; + for (aval, user, weight) in [ + (aval_locale.territory, user_locale.territory, 4), + (aval_locale.codeset, user_locale.codeset, 2), + (aval_locale.modifier, user_locale.modifier, 1), + ] { + match (aval, user) { + (Some(a), Some(u)) if a.eq_ignore_ascii_case(u) => score += weight, + _ => {} // Ignore if both are None + } + } + score + }) + ) + .map(|aval_locale| aval_locale.locale) +} + +/// A POSIX locale as described in [The Open Group Base Specifications Issue 8 - 8. Environment Variables](https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap08.html). +struct PosixLocale<'a> { + locale: &'a str, + language: &'a str, + territory: Option<&'a str>, + codeset: Option<&'a str>, + modifier: Option<&'a str>, +} + +impl<'a> PosixLocale<'a> { + const TERRITORY_DELIMITER: char = '_'; + const CODESET_DELIMITER: char = '.'; + const MODIFIER_DELIMITER: char = '@'; + + /// Parse a POSIX locale string into a `PosixLocale`. + /// + /// The `locale` string should be in the form `language[_territory][.codeset][@modifier]`. + fn parse(locale: &'a str) -> Self { + let codeset_end = locale.find(Self::MODIFIER_DELIMITER).unwrap_or(locale.len()); + let territory_end = locale.find(Self::CODESET_DELIMITER).unwrap_or(codeset_end); + let language_end = locale.find(Self::TERRITORY_DELIMITER).unwrap_or(territory_end); + Self { + locale, + language: &locale[..language_end], + territory: locale.get(language_end + 1..territory_end), + codeset: locale.get(territory_end + 1..codeset_end), + modifier: locale.get(codeset_end + 1..) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_best_matching_locale() { + + fn assert_best_match(available_locales: &[&str], user_locales: &[&str], expected: Option<&str>) { + assert_eq!(best_matching_locale(available_locales.iter(), user_locales.iter()).as_deref(), expected); + } + + // One best match + assert_best_match(&["en_US", "ru_RU"], &["ru", "en"], Some("ru_RU")); + assert_best_match(&["en_US", "ru_RU"], &["en", "ru"], Some("en_US")); + assert_best_match(&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["ru_RU", "ru", "en_US", "en"], Some("ru_UA")); + assert_best_match(&["ru_RU", "sq_AL", "eu_ES"], &["en_US", "en", "sq_XK", "sq"], Some("sq_AL")); + assert_best_match(&["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], &["fr", "fr_FR", "ml", "si", "id", "ku_IQ"], Some("ku_TR")); + assert_best_match(&["st_LS", "sn_ZW", "en_US"], &["zu_ZA", "st_ZA", "en"], Some("st_LS")); + + // Multiple best matches + assert_best_match(&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["en_US", "en", "ru_RU", "ru"], Some("en_US")); + assert_best_match(&["en", "pt_BR", "pt_PT", "es"], &["pt", "en"], Some("pt_BR")); + assert_best_match(&["ku_TR", "ku_IQ", "ku_IR"], &["ku", "en"], Some("ku_TR")); + assert_best_match(&["en_US", "ru_RU", "mn_CN", "sn_ZW", "en", "ru", "mn_MN", "sn"], &["mn", "ru", "en", "sn"], Some("mn_CN")); + + // Identical + assert_best_match(&["en"], &["en"], Some("en")); + assert_best_match(&["en_US"], &["en_US"], Some("en_US")); + assert_best_match(&["en_US", "ru_RU"], &["en_US", "ru_RU"], Some("en_US")); + assert_best_match(&["st_LS", "sn_ZW", "en_US"], &["st_LS", "sn_ZW", "en_US"], Some("st_LS")); + assert_best_match(&["ku_TR", "ku_IQ", "ku_IR"], &["ku_TR", "ku_IQ", "ku_IR"], Some("ku_TR")); + assert_best_match(&["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], &["lv_LV", "ru_RU", "lt_LT", "mn_MN", "ku_TR"], Some("lv_LV")); + + // More complicated cases + assert_best_match(&["en_US", "ru_RU.UTF-8"], &["ru", "en"], Some("ru_RU.UTF-8")); + assert_best_match(&["en_US", "ru.UTF-8", "ru_RU.UTF-8"], &["ru.UTF-8", "en"], Some("ru.UTF-8")); + assert_best_match(&["en_US", "ru_RU.UTF-8", "ru.UTF-8"], &["ru.UTF-8", "en"], Some("ru_RU.UTF-8")); + assert_best_match(&["en_US", "ru.UTF-8@dict", "ru_UA"], &["ru_UA.UTF-8@dict", "en"], Some("ru_UA")); + assert_best_match(&["en_US@dict", "ru_RU"], &["en", "ru"], Some("en_US@dict")); + assert_best_match(&["en_US.CP1252", "en_GB.UTF-8", "ru_UA@icase", "fr_FR@euro", "it.UTF-8"], &["ru_RU.KOI8-R", "ru@icase", "en_US.UTF-8", "en.CP1252"], Some("ru_UA@icase")); + assert_best_match(&["fr", "fr_FR", "fr_CA.UTF-8"], &["fr.UTF-8"], Some("fr_CA.UTF-8")); + assert_best_match(&["en", "pt_BR@dict", "pt_PT@icase", "es"], &["pt.CP1252@euro", "en.UTF-8@dict"], Some("pt_BR@dict")); + assert_best_match(&["en_US", "ru_RU", "mn_CN.UTF-8", "sn_ZW", "en", "ru", "mn_MN@dict", "sn"], &["mn.UTF-8@dict", "ru", "en", "sn"], Some("mn_CN.UTF-8")); + + // One available locale + assert_best_match(&["kk"], &["en", "en_US", "fr_FR", "fr", "it", "pt", "ru_RU", "es_ES", "kk_KZ"], Some("kk")); + + // One user locale + assert_best_match(&["en", "en_US", "fr_FR", "fr", "it", "pt", "ru_RU", "es_ES", "kk_KZ", "pt"], &["pt_PT"], Some("pt")); + + // Not found + assert_best_match(&["en", "en_US", "fr_FR", "fr", "it", "pt", "es_ES", "kk_KZ", "pt"], &["ru"], None); + assert_best_match(&["en", "en_US", "fr_FR", "fr", "pt"], &["id"], None); + assert_best_match(&["ru", "be", "uk", "kk"], &["en"], None); + + // Empty available locales + assert_best_match(&[], &["en", "fr", "it", "pt"], None); + + // Empty user locales + assert_best_match(&["en", "fr", "it", "pt"], &[], None); + + // Both lists empty + assert_best_match(&[], &[], None); + + // Malformed + assert_best_match(&[" en"], &["en"], None); + assert_best_match(&["?ru"], &["ru"], None); + assert_best_match(&["ruRU"], &["ru"], None); + + // Repeating + assert_best_match(&["en", "en", "en", "en"], &["ru_RU", "ru", "en_US", "en"], Some("en")); + assert_best_match(&["en_US", "en_GB", "ru_UA", "fr_FR", "it"], &["kk", "ru", "pt", "ru"], Some("ru_UA")); + + // Littered + assert_best_match(&["!!!!!!", "qwydgn12i6i", "ЖЖяяЖяЬЬЬ", "en_US", "!*&^^&*", "qweqweqweqwe_qweqwe", "ru_RU", "@@", "@"], &["ru", "en"], Some("ru_RU")); + assert_best_match(&["", "", "", "zh", "", "", "", "", "", "he", "", ""], &["he", "", "", "zh"], Some("he")); + + // Special characters + assert_best_match(&["sq\0", "ru_RU", "sq_AL", "eu_ES"], &["en_US", "en", "sq_XK", "sq"], Some("sq_AL")); + assert_best_match(&["\0", "\x01\x02\x03\x04", "sq\0", "ru_RU", "sq_AL", "eu_ES"], &["en_US", "\x06", "en", "sq_XK", "sq", "\0"], Some("sq_AL")); + + // Various letter cases + assert_best_match(&["EN"], &["en"], Some("EN")); + assert_best_match(&["En"], &["EN"], Some("En")); + assert_best_match(&["Ru_rU"], &["en", "ru"], Some("Ru_rU")); + assert_best_match(&["rU_rU"], &["en", "Ru"], Some("rU_rU")); + assert_best_match(&["EN.Utf-8"], &["en.UTF-8"], Some("EN.Utf-8")); + assert_best_match(&["En@dIcT"], &["EN_us"], Some("En@dIcT")); + assert_best_match(&["ru_ru.utf-8@icase"], &["en", "RU_RU.UTF-8@ICASE"], Some("ru_ru.utf-8@icase")); + assert_best_match(&["fr_FR.CP1252@euRO"], &["FR", "en"], Some("fr_FR.CP1252@euRO")); + } + + #[test] + #[allow(non_snake_case)] + fn test_PosixLocale() { + + fn assert_parts(locale: &str, parts: (&str, Option<&str>, Option<&str>, Option<&str>)) { + let posix_locale = PosixLocale::parse(locale); + assert_eq!(posix_locale.locale, locale); + assert_eq!(posix_locale.language, parts.0); + assert_eq!(posix_locale.territory, parts.1); + assert_eq!(posix_locale.codeset, parts.2); + assert_eq!(posix_locale.modifier, parts.3); + } + + // Language only + assert_parts("en", ("en", None, None, None)); + assert_parts("ru", ("ru", None, None, None)); + assert_parts("fr", ("fr", None, None, None)); + + // Language and territory + assert_parts("en_US", ("en", Some("US"), None, None)); + assert_parts("ru_RU", ("ru", Some("RU"), None, None)); + assert_parts("fr_FR", ("fr", Some("FR"), None, None)); + + // Language and codeset + assert_parts("en.UTF-8", ("en", None, Some("UTF-8"), None)); + assert_parts("ru.KOI8-R", ("ru", None, Some("KOI8-R"), None)); + assert_parts("fr.CP1252", ("fr", None, Some("CP1252"), None)); + + // Language and modifier + assert_parts("en@dict", ("en", None, None, Some("dict"))); + assert_parts("ru@icase", ("ru", None, None, Some("icase"))); + assert_parts("fr@euro", ("fr", None, None, Some("euro"))); + + // Language, territory and codeset + assert_parts("en_US.UTF-8", ("en", Some("US"), Some("UTF-8"), None)); + assert_parts("ru_RU.KOI8-R", ("ru", Some("RU"), Some("KOI8-R"), None)); + assert_parts("fr_FR.CP1252", ("fr", Some("FR"), Some("CP1252"), None)); + + // Language, territory and modifier + assert_parts("en_US@dict", ("en", Some("US"), None, Some("dict"))); + assert_parts("ru_RU@icase", ("ru", Some("RU"), None, Some("icase"))); + assert_parts("fr_FR@euro", ("fr", Some("FR"), None, Some("euro"))); + + // Language, codeset and modifier + assert_parts("en.UTF-8@dict", ("en", None, Some("UTF-8"), Some("dict"))); + assert_parts("ru.KOI8-R@icase", ("ru", None, Some("KOI8-R"), Some("icase"))); + assert_parts("fr.CP1252@euro", ("fr", None, Some("CP1252"), Some("euro"))); + + // Language, territory, codeset and modifier + assert_parts("en_US.UTF-8@dict", ("en", Some("US"), Some("UTF-8"), Some("dict"))); + assert_parts("ru_RU.KOI8-R@icase", ("ru", Some("RU"), Some("KOI8-R"), Some("icase"))); + assert_parts("fr_FR.CP1252@euro", ("fr", Some("FR"), Some("CP1252"), Some("euro"))); + + // Various letter cases + assert_parts("EN", ("EN", None, None, None)); + assert_parts("Ru", ("Ru", None, None, None)); + assert_parts("fR", ("fR", None, None, None)); + assert_parts("eN_us.Utf-8", ("eN", Some("us"), Some("Utf-8"), None)); + assert_parts("RU_ru.koi8-R", ("RU", Some("ru"), Some("koi8-R"), None)); + assert_parts("Fr_Fr.Cp1252", ("Fr", Some("Fr"), Some("Cp1252"), None)); + assert_parts("en_us.utf-8@DICT", ("en", Some("us"), Some("utf-8"), Some("DICT"))); + assert_parts("RU_RU.KOI8-R@Icase", ("RU", Some("RU"), Some("KOI8-R"), Some("Icase"))); + assert_parts("fR_fR.cP1252@eUrO", ("fR", Some("fR"), Some("cP1252"), Some("eUrO"))); + + // Empty + assert_parts("", ("", None, None, None)); + + // Whitespace + assert_parts(" ", (" ", None, None, None)); + assert_parts(" ", (" ", None, None, None)); + assert_parts("\t", ("\t", None, None, None)); + assert_parts("\n", ("\n", None, None, None)); + assert_parts("\n \t\t\n \n\t \t\t\n\n\t", ("\n \t\t\n \n\t \t\t\n\n\t", None, None, None)); + + // Litter + assert_parts("!!!", ("!!!", None, None, None)); + assert_parts("12345", ("12345", None, None, None)); + assert_parts("+-+-", ("+-+-", None, None, None)); + + // Malformed + assert_parts("!!!_9999.UUU@()()", ("!!!", Some("9999"), Some("UUU"), Some("()()"))); + assert_parts("12_123.1234@12345", ("12", Some("123"), Some("1234"), Some("12345"))); + assert_parts("+-+-@+-+-", ("+-+-", None, None, Some("+-+-"))); + + // Wrong order EXPECTED TO BE BROKEN + assert_parts("lang.codeset_region@modifier", ("lang.codeset", None, Some("codeset_region"), Some("modifier"))); + assert_parts("lang@modifier.codeset_region", ("lang@modifier.codeset", None, None, Some("modifier.codeset_region"))); + assert_parts("lang_region@modifier.codeset", ("lang", Some("region@modifier"), None, Some("modifier.codeset"))); + assert_parts("lang.codeset@modifier_region", ("lang.codeset@modifier", None, Some("codeset"), Some("modifier_region"))); + assert_parts("lang@modifier_region.codeset", ("lang@modifier", Some("region"), None, Some("modifier_region.codeset"))); + + // Parts missing + assert_parts("_.@", ("", Some(""), Some(""), Some(""))); + assert_parts("_US.UTF-8@dict", ("", Some("US"), Some("UTF-8"), Some("dict"))); + assert_parts("ru_.KOI8-R@icase", ("ru", Some(""), Some("KOI8-R"), Some("icase"))); + assert_parts("fr_FR.@euro", ("fr", Some("FR"), Some(""), Some("euro"))); + assert_parts("de_DE.ISO-8859-1@", ("de", Some("DE"), Some("ISO-8859-1"), Some(""))); + + // Special characters + assert_parts("\0", ("\0", None, None, None)); + assert_parts("\0_\0.\0@\0", ("\0", Some("\0"), Some("\0"), Some("\0"))); + assert_parts("\0\x01\x02\x03", ("\0\x01\x02\x03", None, None, None)); + assert_parts("\x03\x02\x01", ("\x03\x02\x01", None, None, None)); + } +} \ No newline at end of file