projectfluent · raphlinus · Apr 12, 2019
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,6 +17,9 @@ coveralls = { repository = "projectfluent/fluent-locale-rs", branch = "master",
 
 maintenance = { status = "actively-developed" }
 
+[features]
+likely-subtags = []
+
 [dependencies]
 
 [dev-dependencies]

diff --git a/scripts/gen_tables.py b/scripts/gen_tables.py
@@ -0,0 +1,97 @@
+import sys
+import re
+import codecs
+
+import xml.etree.ElementTree as ET
+
+def gen_tables(filename, outfile):
+    o = codecs.open(outfile, 'w', 'utf-8')
+    tree = ET.parse(filename)
+    root = tree.getroot()
+    assert(root.tag == 'supplementalData')
+    entries = []
+    for child in root:
+        if child.tag == 'likelySubtags':
+            for subtag in child:
+                fr, to = subtag.attrib['from'], subtag.attrib['to']
+                if len(to.split('_')) != 3:
+                    print('Unexpected "to" string: ' + to)
+                    exit(1)
+                entries.append((fr, to.replace('_', '-')))
+    lang_re = re.compile('([a-z]+)$')
+    lang_only = []
+    lang_reg_re = re.compile('([a-z]+)_([A-Z]+|[0-9]+)$')
+    lang_reg = []
+    lang_script_re = re.compile('([a-z]+)_([A-Z][a-z]+)$')
+    lang_script = []
+    script_region_re = re.compile('und_([A-Z][a-z]+)_([A-Z]+|[0-9]+)$')
+    script_region = []
+    for (fr, to) in entries:
+        m = lang_re.match(fr)
+        if m:
+            lang = m.group(1)
+            if lang == 'und': lang = ''
+            lang_only.append((tag_to_hex(lang), to))
+            continue
+        m = lang_reg_re.match(fr)
+        if m:
+            lang = m.group(1)
+            region = m.group(2)
+            if lang == 'und': lang = ''
+            tag = tag_to_hex(lang) | (tag_to_hex(region) << 32)
+            lang_reg.append((tag, to))
+            continue
+        m = lang_script_re.match(fr)
+        if m:
+            lang = m.group(1)
+            script = m.group(2)
+            if lang == 'und': lang = ''
+            tag = tag_to_hex(lang) | (tag_to_hex(script) << 32)
+            lang_script.append((tag, to))
+            continue
+        m = script_region_re.match(fr)
+        if m:
+            script = m.group(1)
+            region = m.group(2)
+            tag = tag_to_hex(script) | (tag_to_hex(region) << 32)
+            script_region.append((tag, to))
+            continue
+        print('Unexpected "from" string: ' + fr)
+        exit(1)
+    o.write("""// This file was automatically generated by gen_tables.py.
+// It is derived from likelySubtags.xml from CLDR (http://cldr.unicode.org/)
+
+// That file contains the following copyright notice:
+
+// Copyright © 1991-2018 Unicode, Inc.
+// For terms of use, see http://www.unicode.org/copyright.html
+// Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+
+""")
+    print_table('LANG_ONLY', "(u32, &'static str)", lang_only, o)
+    print_table('LANG_REGION', "(u64, &'static str)", lang_reg, o)
+    print_table('LANG_SCRIPT', "(u64, &'static str)", lang_script, o)
+    print_table('SCRIPT_REGION', "(u64, &'static str)", script_region, o)
+
+def print_table(name, ty, data, o):
+    data.sort()
+    o.write('pub const ' + name + ': [' + ty + '; ' + str(len(data)) + '] = [\n')
+    for fr, to in data:
+        o.write('    (' + hex(fr) + ', "' + to + '"),\n')
+    o.write('];\n')
+
+def main(args):
+    if len(args) < 2:
+        print("Usage: python3 gen_tables.py likelySubtags.xml table.rs")
+        exit(1)
+    filename = args[1]
+    outfile = args[2]
+    gen_tables(filename, outfile)
+
+def tag_to_hex(tag):
+    result = 0
+    for i, c in enumerate(tag):
+        result += ord(c) << (i * 8)
+    return result
+
+main(sys.argv)
diff --git a/src/locale/likely_subtags.rs b/src/locale/likely_subtags.rs
@@ -0,0 +1,151 @@
+#[path = "tables.rs"]
+mod tables;
+
+use super::Locale;
+
+/// Encode a subtag as the little-endian integer formed by its bytes.
+///
+/// The first byte of `s` ends up in the lowest 8 bits; an empty string
+/// encodes to 0. Strings longer than four bytes cannot be represented and
+/// yield the all-ones value `!0`.
+fn str_to_le(s: &str) -> u32 {
+    let bytes = s.as_bytes();
+    if bytes.len() > 4 {
+        return !0;
+    }
+    // Fold from the last byte down so that the first byte lands lowest.
+    bytes
+        .iter()
+        .rev()
+        .fold(0, |acc, &byte| (acc << 8) | u32::from(byte))
+}
+
+/// Binary-search `data` for `key` and return a copy of the paired value.
+///
+/// `data` must be sorted by key in ascending order. Returns `None` when no
+/// entry has the requested key.
+fn lookup_binary<K: Ord, T: Copy>(key: K, data: &[(K, T)]) -> Option<T> {
+    let idx = data.binary_search_by(|entry| entry.0.cmp(&key)).ok()?;
+    Some(data[idx].1)
+}
+
+/// Fill the empty subtags of `locale` from a likely-subtags table entry.
+///
+/// `lookup` must have the form "lang-Scrp-REG"; the function panics when it
+/// contains fewer than two `-` separators. Subtags that are already present
+/// on `locale` are left untouched, and the setters' return values are
+/// discarded.
+fn apply_lookup(locale: &mut Locale, lookup: &str) {
+    // At most three pieces, so anything after the second `-` stays in `region`.
+    let mut pieces = lookup.splitn(3, '-');
+    let lang = pieces.next().unwrap();
+    let script = pieces.next().unwrap();
+    let region = pieces.next().unwrap();
+
+    if locale.get_language().is_empty() {
+        let _ = locale.set_language(lang);
+    }
+    if locale.get_script().is_empty() {
+        let _ = locale.set_script(script);
+    }
+    if locale.get_region().is_empty() {
+        let _ = locale.set_region(region);
+    }
+}
+
+/// `str_to_le` encoding of the script code "Zzzz", which
+/// `add_likely_subtags` clears before performing any lookup.
+const SCRIPT_ZZZZ_TAG: u32 = 0x7a7a_7a5a;
+/// `str_to_le` encoding of the region code "ZZ", which
+/// `add_likely_subtags` clears before performing any lookup.
+const REGION_ZZ_TAG: u32 = 0x0000_5a5a;
+
+impl Locale {
+    /// Add likely subtags to the locale, filling in whichever of language,
+    /// script and region are missing.
+    ///
+    /// The `Zzzz` script and `ZZ` region codes are cleared first and treated
+    /// as absent for the lookups. The generated tables are then consulted in
+    /// this order, stopping at the first match:
+    ///
+    /// 1. script + region (only when the language is empty),
+    /// 2. language + region,
+    /// 3. language + script,
+    /// 4. language alone,
+    /// 5. script alone.
+    ///
+    /// The match is applied with `apply_lookup`, which only fills subtags
+    /// that are still empty.
+    ///
+    /// Returns `true` when a lookup succeeded and `false` otherwise. The
+    /// clearing of `Zzzz` / `ZZ` is kept even when no lookup matches.
+    ///
+    /// See <http://www.unicode.org/reports/tr35/#Likely_Subtags>
+    pub fn add_likely_subtags(&mut self) -> bool {
+        // Canonicalize.
+        // TODO: replace deprecated subtags.
+        // TODO: bail on grandfathered tag.
+        let lang = str_to_le(self.get_language());
+        let mut script = str_to_le(self.get_script());
+        let mut region = str_to_le(self.get_region());
+        if script == SCRIPT_ZZZZ_TAG {
+            let _ = self.set_script("");
+            // Keep the lookup key in sync with the cleared subtag.
+            script = 0;
+        }
+        if region == REGION_ZZ_TAG {
+            let _ = self.set_region("");
+            region = 0;
+        }
+
+        // Two-subtag keys hold the first subtag in the low 32 bits and the
+        // second one in the high 32 bits, matching the generated tables.
+        let pack = |low: u32, high: u32| (u64::from(high) << 32) | u64::from(low);
+
+        // Lookup. The script + region table only holds `und` entries, so it
+        // is consulted only when no language is set.
+        let und_match = if lang == 0 {
+            lookup_binary(pack(script, region), &tables::SCRIPT_REGION)
+        } else {
+            None
+        };
+        let found = und_match
+            .or_else(|| lookup_binary(pack(lang, region), &tables::LANG_REGION))
+            .or_else(|| lookup_binary(pack(lang, script), &tables::LANG_SCRIPT))
+            .or_else(|| lookup_binary(lang, &tables::LANG_ONLY))
+            // und-script
+            .or_else(|| lookup_binary(pack(0, script), &tables::LANG_SCRIPT));
+
+        match found {
+            Some(lookup) => {
+                apply_lookup(self, lookup);
+                true
+            }
+            None => false,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::Locale;
+
+    /// Expand `fr` with likely subtags and check that it serializes to `to`.
+    fn a(fr: &str, to: &str) {
+        let mut locale = Locale::from(fr);
+        locale.add_likely_subtags();
+        assert_eq!(locale.to_string(), to);
+    }
+
+    /// Run `a` over every `(input, expected)` pair, in order.
+    fn check_all(cases: &[(&str, &str)]) {
+        for &(fr, to) in cases {
+            a(fr, to);
+        }
+    }
+
+    #[test]
+    fn add_likely() {
+        check_all(&[
+            ("en-Zzzz-US", "en-Latn-US"),
+            ("en-ZZ", "en-Latn-US"),
+            ("und-Arab-CC", "ms-Arab-CC"),
+            ("und-Hebr-GB", "yi-Hebr-GB"),
+            ("yi-GB", "yi-Hebr-GB"),
+            ("az", "az-Latn-AZ"),
+            ("az-IS", "az-Latn-IS"),
+            ("az-IQ", "az-Arab-IQ"),
+            ("az-RU", "az-Cyrl-RU"),
+            ("az-Arab", "az-Arab-IR"),
+            ("zh-CN", "zh-Hans-CN"),
+            ("zh-HK", "zh-Hant-HK"),
+            ("und-Adlm", "ff-Adlm-GN"),
+            ("und-Adlm-IS", "is-Adlm-IS"),
+            ("und-Adlm-IO", "ff-Adlm-IO"),
+            ("und-CN", "zh-Hans-CN"),
+            ("en-Shaw", "en-Shaw-GB"),
+            // Example from the spec.
+            ("ZH-ZZZZ-SG", "zh-Hans-SG"),
+        ]);
+    }
+
+    #[test]
+    fn add_likely_consistent_with_shortcuts() {
+        // Make sure this logic is consistent with the shortcuts in the negotiate module.
+        check_all(&[
+            ("az", "az-Latn-AZ"),
+            ("bg", "bg-Cyrl-BG"),
+            ("cs", "cs-Latn-CZ"),
+            ("de", "de-Latn-DE"),
+            ("en", "en-Latn-US"),
+            ("es", "es-Latn-ES"),
+            ("fi", "fi-Latn-FI"),
+            ("fr", "fr-Latn-FR"),
+            ("hu", "hu-Latn-HU"),
+            ("it", "it-Latn-IT"),
+            ("lt", "lt-Latn-LT"),
+            ("lv", "lv-Latn-LV"),
+            ("nl", "nl-Latn-NL"),
+            ("pl", "pl-Latn-PL"),
+            ("ro", "ro-Latn-RO"),
+            ("ru", "ru-Cyrl-RU"),
+            ("sr", "sr-Cyrl-RS"),
+            ("sr-RU", "sr-Latn-RU"),
+            ("az-IR", "az-Arab-IR"),
+            ("zh-GB", "zh-Hant-GB"),
+            ("zh-US", "zh-Hant-US"),
+        ]);
+    }
+}
diff --git a/src/locale/mod.rs b/src/locale/mod.rs
@@ -4,6 +4,9 @@ use std::fmt;
 mod options;
 mod parser;
 
+#[cfg(feature = "likely-subtags")]
+mod likely_subtags;
+
 /// A Locale object.
 ///
 /// Locale object stores information encoded in a language tag and provides