diff --git a/resources/parser/data_sets/openaddresses.yaml b/resources/parser/data_sets/openaddresses.yaml
index 8d947c02a..2f77c0427 100644
--- a/resources/parser/data_sets/openaddresses.yaml
+++ b/resources/parser/data_sets/openaddresses.yaml
@@ -1,8 +1,9 @@
 global:
     cldr_country_probability: 0.5
-    localized_name_probability: 0.7
+    localized_name_probability: 0.6
     iso_alpha_2_code_probability: 0.2
     iso_alpha_3_code_probability: 0.1
+    iso_3166_name_probability: 0.1
 
     abbreviate_street_probability: 0.3
     separate_street_probability: 0.2
diff --git a/resources/parser/default.yaml b/resources/parser/default.yaml
index ea9c21c1c..b20e59821 100644
--- a/resources/parser/default.yaml
+++ b/resources/parser/default.yaml
@@ -139,6 +139,7 @@ country:
     # When the user-specified country is an ISO code, remove it from the components with this probability (fall back on geocoded components)
     remove_iso_code_probability: 0.1
     cldr:
-        localized_name_probability: 0.97
+        localized_name_probability: 0.92
         iso_alpha_2_code_probability: 0.02
         iso_alpha_3_code_probability: 0.01
+        iso_3166_name_probability: 0.05
diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index de4f58982..1b51c79f7 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -684,10 +684,11 @@ def cldr_country_name(cls, country_code, language):
 
         alpha_2_iso_code_prob = float(cldr_config['iso_alpha_2_code_probability'])
         localized_name_prob = float(cldr_config['localized_name_probability'])
+        iso_3166_name_prob = float(cldr_config['iso_3166_name_probability'])
         alpha_3_iso_code_prob = float(cldr_config['iso_alpha_3_code_probability'])
 
-        values = ('localized', 'alpha3', 'alpha2')
-        probs = cdf([localized_name_prob, alpha_3_iso_code_prob, alpha_2_iso_code_prob])
+        localized, iso_3166, alpha3, alpha2 = values = range(4)  # values is still passed to weighted_choice() below
+        probs = cdf([localized_name_prob, iso_3166_name_prob, alpha_3_iso_code_prob, alpha_2_iso_code_prob])
         value = weighted_choice(values, probs)
 
         country_name = country_code.upper()
@@ -695,9 +696,11 @@ def cldr_country_name(cls, country_code, language):
         if language in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
             language = None
 
-        if value == 'localized':
+        if value == localized:
             country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
-        elif value == 'alpha3':
+        elif value == iso_3166:
+            country_name = country_names.iso_3166_name(country_code) or country_name  # iso_3166_name() returns None for unknown codes
+        elif value == alpha3:
             country_name = country_names.alpha3_code(country_code) or country_name
 
         return country_name
diff --git a/scripts/geodata/countries/names.py b/scripts/geodata/countries/names.py
index bff7090ca..19c4b96da 100644
--- a/scripts/geodata/countries/names.py
+++ b/scripts/geodata/countries/names.py
@@ -52,6 +52,7 @@ def __init__(self, base_dir=CLDR_MAIN_PATH):
         self.base_dir = base_dir
 
         self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
+        self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}
 
         self.language_country_names = {}
         self.country_language_names = defaultdict(dict)
@@ -177,7 +178,10 @@ def localized_name(self, country_code, language=None):
         return self.country_language_names.get(country_code, {}).get(language)
 
     def alpha3_code(self, alpha2_code):
-        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
+        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
         return alpha3.upper() if alpha3 else None
 
+    def iso_3166_name(self, alpha2_code):
+        return self.iso_3166_names.get(alpha2_code.lower())  # None when the code has no pycountry entry
+
 country_names = CountryNames()
diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py
index 3ff296df4..ce7002cb2 100644
--- a/scripts/geodata/openaddresses/formatter.py
+++ b/scripts/geodata/openaddresses/formatter.py
@@ -243,17 +243,20 @@ def cldr_country_name(self, country_code, language, configs):
         country_name = None
 
         if random.random() < cldr_country_prob:
-            localized, alpha2, alpha3 = values = range(3)
+            localized, iso_3166, alpha2, alpha3 = values = range(4)
             localized_prob = float(self.get_property('localized_name_probability', *configs))
+            iso_3166_prob = float(self.get_property('iso_3166_name_probability', *configs))
             alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
             alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))
 
-            probs = cdf([localized_prob, alpha2_prob, alpha3_prob])
+            probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])
 
             country_type = weighted_choice(values, probs)
 
             country_name = country_code.upper()
             if country_type == localized:
                 country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
+            elif country_type == iso_3166:
+                country_name = country_names.iso_3166_name(country_code) or country_name  # iso_3166_name() returns None for unknown codes
             elif country_type == alpha3:
                 country_name = country_names.alpha3_code(country_code) or country_name