Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add implementation of likely subtags #11

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ coveralls = { repository = "projectfluent/fluent-locale-rs", branch = "master",

maintenance = { status = "actively-developed" }

[features]
likely-subtags = []

[dependencies]

[dev-dependencies]
Expand Down
97 changes: 97 additions & 0 deletions scripts/gen_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import sys
import re
import codecs

import xml.etree.ElementTree as ET

def gen_tables(filename, outfile):
    """Generate the Rust lookup tables from a CLDR likelySubtags.xml file.

    Every `from`/`to` pair under the `likelySubtags` element is classified by
    the shape of its `from` string and stored in one of four tables:

    * LANG_ONLY     -- `lang`            (key: packed language)
    * LANG_REGION   -- `lang_REGION`     (key: language | region << 32)
    * LANG_SCRIPT   -- `lang_Script`     (key: language | script << 32)
    * SCRIPT_REGION -- `und_Script_REG`  (key: script | region << 32)

    The language `und` is encoded as the empty string, i.e. a zero key part.
    The `to` value is stored with `_` replaced by `-`.

    filename -- path of the XML input.
    outfile  -- path of the Rust source file to write.

    Exits the process with status 1 when a `to` string does not have exactly
    three `_`-separated parts or a `from` string matches none of the shapes.
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    assert(root.tag == 'supplementalData')

    entries = []
    for child in root:
        if child.tag == 'likelySubtags':
            for subtag in child:
                fr, to = subtag.attrib['from'], subtag.attrib['to']
                if len(to.split('_')) != 3:
                    print('Unexpected "to" string: ' + to)
                    sys.exit(1)
                entries.append((fr, to.replace('_', '-')))

    lang_re = re.compile('([a-z]+)$')
    lang_only = []
    lang_reg_re = re.compile('([a-z]+)_([A-Z]+|[0-9]+)$')
    lang_reg = []
    lang_script_re = re.compile('([a-z]+)_([A-Z][a-z]+)$')
    lang_script = []
    script_region_re = re.compile('und_([A-Z][a-z]+)_([A-Z]+|[0-9]+)$')
    script_region = []

    for (fr, to) in entries:
        m = lang_re.match(fr)
        if m:
            lang = m.group(1)
            if lang == 'und': lang = ''
            lang_only.append((tag_to_hex(lang), to))
            continue
        m = lang_reg_re.match(fr)
        if m:
            lang = m.group(1)
            region = m.group(2)
            if lang == 'und': lang = ''
            tag = tag_to_hex(lang) | (tag_to_hex(region) << 32)
            lang_reg.append((tag, to))
            continue
        m = lang_script_re.match(fr)
        if m:
            lang = m.group(1)
            script = m.group(2)
            if lang == 'und': lang = ''
            tag = tag_to_hex(lang) | (tag_to_hex(script) << 32)
            lang_script.append((tag, to))
            continue
        m = script_region_re.match(fr)
        if m:
            script = m.group(1)
            region = m.group(2)
            tag = tag_to_hex(script) | (tag_to_hex(region) << 32)
            script_region.append((tag, to))
            continue
        print('Unexpected "from" string: ' + fr)
        sys.exit(1)

    # Open the output only after the input has been fully parsed and
    # validated, so a bad input never leaves a truncated table file behind.
    # The `with` block guarantees the file is flushed and closed.
    with codecs.open(outfile, 'w', 'utf-8') as o:
        o.write("""// This file was automatically generated by gen_tables.py.
// It is derived from likelySubtags.xml from CLDR (http://cldr.unicode.org/)

// That file contains the following copyright notice:

// Copyright © 1991-2018 Unicode, Inc.
// For terms of use, see http://www.unicode.org/copyright.html
// Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.

""")
        print_table('LANG_ONLY', "(u32, &'static str)", lang_only, o)
        print_table('LANG_REGION', "(u64, &'static str)", lang_reg, o)
        print_table('LANG_SCRIPT', "(u64, &'static str)", lang_script, o)
        print_table('SCRIPT_REGION', "(u64, &'static str)", script_region, o)

def print_table(name, ty, data, o):
    """Write `data` to `o` as a sorted Rust `pub const` array.

    name -- identifier of the generated constant.
    ty   -- Rust element type, written verbatim into the array type.
    data -- list of (integer key, string value) pairs; sorted in place.
    o    -- writable text stream receiving the Rust source.
    """
    data.sort()
    o.write('pub const {}: [{}; {}] = [\n'.format(name, ty, len(data)))
    for key, value in data:
        o.write(' ({}, "{}"),\n'.format(hex(key), value))
    o.write('];\n')

def main(args):
    """Command-line entry point.

    args -- argv-style list: program name, input XML path, output Rust path.

    Prints a usage message and exits with status 1 unless both paths are
    present.
    """
    # Both args[1] and args[2] are read below, so at least three items are
    # required (program name plus two paths).
    if len(args) < 3:
        print("Usage: python3 gen_tables.py likelySubtags.xml table.rs")
        sys.exit(1)
    filename = args[1]
    outfile = args[2]
    gen_tables(filename, outfile)

def tag_to_hex(tag):
    """Pack the characters of `tag` into one integer.

    The code point of the character at index i is shifted left by 8*i bits,
    so the first character lands in the lowest byte. An empty tag gives 0.
    """
    return sum(ord(ch) << (8 * pos) for pos, ch in enumerate(tag))

# Script entry point: run the generator with the process's command-line arguments.
main(sys.argv)
151 changes: 151 additions & 0 deletions src/locale/likely_subtags.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#[path = "tables.rs"]
mod tables;

use super::Locale;

/// Pack the bytes of `s` into a `u32`, first byte in the least-significant position.
///
/// Strings of up to four bytes are zero-padded in the high bytes, so the empty
/// string maps to `0`. Anything longer than four bytes yields the sentinel
/// `u32::MAX`.
fn str_to_le(s: &str) -> u32 {
    let bytes = s.as_bytes();
    if bytes.len() > 4 {
        return u32::MAX;
    }
    // Folding from the last byte down leaves the first byte in the low bits.
    bytes
        .iter()
        .rev()
        .fold(0u32, |packed, &byte| (packed << 8) | u32::from(byte))
}

/// Binary-search `data` for `key` and return a copy of the paired value.
///
/// `data` must be sorted by its first tuple field. Returns `None` when no
/// entry has that key.
fn lookup_binary<K: Ord, T: Copy>(key: K, data: &[(K, T)]) -> Option<T> {
    match data.binary_search_by(|entry| entry.0.cmp(&key)) {
        Ok(index) => Some(data[index].1),
        Err(_) => None,
    }
}

/// Fill the empty subtags of `locale` from a table entry.
///
/// `lookup` must have the form "lang-Scrp-REG". A subtag that is already set
/// on `locale` is left alone; only empty ones are taken from `lookup`.
///
/// # Panics
///
/// Panics when `lookup` contains fewer than two `-` separators.
fn apply_lookup(locale: &mut Locale, lookup: &str) {
    // Split into at most three pieces so the region keeps everything after
    // the second separator. All pieces are extracted before the locale is
    // touched.
    let mut pieces = lookup.splitn(3, '-');
    let language = pieces.next().unwrap();
    let script = pieces.next().unwrap();
    let region = pieces.next().unwrap();

    if locale.get_language().is_empty() {
        let _ = locale.set_language(language);
    }
    if locale.get_script().is_empty() {
        let _ = locale.set_script(script);
    }
    if locale.get_region().is_empty() {
        let _ = locale.set_region(region);
    }
}

/// Packed key of the script subtag "Zzzz" (first character in the low byte).
/// `add_likely_subtags` clears the script when its packed value equals this.
const SCRIPT_ZZZZ_TAG: u32 = 0x7a7a7a5a; // "Zzzz" in little-endian
/// Packed key of the region subtag "ZZ" (first character in the low byte).
/// `add_likely_subtags` clears the region when its packed value equals this.
const REGION_ZZ_TAG: u32 = 0x5a5a; // "ZZ" in little-endian

impl Locale {
    /// Add likely subtags to the locale.
    ///
    /// The placeholder script `Zzzz` and region `ZZ` are removed first. The
    /// remaining subtags are then looked up in the generated tables, in this
    /// order, stopping at the first hit:
    ///
    /// 1. script + region (only when the language is empty)
    /// 2. language + region
    /// 3. language + script
    /// 4. language
    /// 5. script alone
    ///
    /// On a hit, every subtag that is still empty is filled from the table
    /// entry; subtags that are already set are kept.
    ///
    /// Returns `true` when the lookup succeeded. When it returns `false` the
    /// locale is unchanged apart from the removal of `Zzzz` / `ZZ`.
    ///
    /// See <http://www.unicode.org/reports/tr35/#Likely_Subtags>
    pub fn add_likely_subtags(&mut self) -> bool {
        // Canonicalize.
        // TODO: replace deprecated subtags.
        // TODO: bail on grandfathered tag.
        let lang = str_to_le(self.get_language());
        let mut script = str_to_le(self.get_script());
        let mut region = str_to_le(self.get_region());
        if script == SCRIPT_ZZZZ_TAG {
            let _ = self.set_script("");
            // Keep the lookup key in sync with the now-empty script.
            script = 0;
        }
        if region == REGION_ZZ_TAG {
            let _ = self.set_region("");
            // Keep the lookup key in sync with the now-empty region.
            region = 0;
        }

        // Table keys hold the first component in the low 32 bits and the
        // second component in the high 32 bits.
        let pack = |low: u32, high: u32| (u64::from(high) << 32) | u64::from(low);

        // Lookup.
        let script_region = if lang == 0 {
            lookup_binary(pack(script, region), &tables::SCRIPT_REGION)
        } else {
            None
        };
        let found = script_region
            .or_else(|| lookup_binary(pack(lang, region), &tables::LANG_REGION))
            .or_else(|| lookup_binary(pack(lang, script), &tables::LANG_SCRIPT))
            .or_else(|| lookup_binary(lang, &tables::LANG_ONLY))
            // und-script
            .or_else(|| lookup_binary(pack(0, script), &tables::LANG_SCRIPT));

        match found {
            Some(lookup) => {
                apply_lookup(self, lookup);
                true
            }
            None => false,
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::Locale;

    /// Expand each input tag and compare its string form with the expected tag.
    fn check_all(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            let mut locale = Locale::from(input);
            locale.add_likely_subtags();
            assert_eq!(locale.to_string(), expected);
        }
    }

    #[test]
    fn add_likely() {
        check_all(&[
            ("en-Zzzz-US", "en-Latn-US"),
            ("en-ZZ", "en-Latn-US"),
            ("und-Arab-CC", "ms-Arab-CC"),
            ("und-Hebr-GB", "yi-Hebr-GB"),
            ("yi-GB", "yi-Hebr-GB"),
            ("az", "az-Latn-AZ"),
            ("az-IS", "az-Latn-IS"),
            ("az-IQ", "az-Arab-IQ"),
            ("az-RU", "az-Cyrl-RU"),
            ("az-Arab", "az-Arab-IR"),
            ("zh-CN", "zh-Hans-CN"),
            ("zh-HK", "zh-Hant-HK"),
            ("und-Adlm", "ff-Adlm-GN"),
            ("und-Adlm-IS", "is-Adlm-IS"),
            ("und-Adlm-IO", "ff-Adlm-IO"),
            ("und-CN", "zh-Hans-CN"),
            ("en-Shaw", "en-Shaw-GB"),
            // example from spec
            ("ZH-ZZZZ-SG", "zh-Hans-SG"),
        ]);
    }

    #[test]
    fn add_likely_consistent_with_shortcuts() {
        // Make sure this logic is consistent with the shortcuts in the negotiate module.
        check_all(&[
            ("az", "az-Latn-AZ"),
            ("bg", "bg-Cyrl-BG"),
            ("cs", "cs-Latn-CZ"),
            ("de", "de-Latn-DE"),
            ("en", "en-Latn-US"),
            ("es", "es-Latn-ES"),
            ("fi", "fi-Latn-FI"),
            ("fr", "fr-Latn-FR"),
            ("hu", "hu-Latn-HU"),
            ("it", "it-Latn-IT"),
            ("lt", "lt-Latn-LT"),
            ("lv", "lv-Latn-LV"),
            ("nl", "nl-Latn-NL"),
            ("pl", "pl-Latn-PL"),
            ("ro", "ro-Latn-RO"),
            ("ru", "ru-Cyrl-RU"),
            ("sr", "sr-Cyrl-RS"),
            ("sr-RU", "sr-Latn-RU"),
            ("az-IR", "az-Arab-IR"),
            ("zh-GB", "zh-Hant-GB"),
            ("zh-US", "zh-Hant-US"),
        ]);
    }
}
3 changes: 3 additions & 0 deletions src/locale/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ use std::fmt;
mod options;
mod parser;

#[cfg(feature = "likely-subtags")]
mod likely_subtags;

/// A Locale object.
///
/// Locale object stores information encoded in a language tag and provides
Expand Down
Loading