Skip to content

Commit

Permalink
[countries] use ISO 3166 country name 5% of the time for general addr…
Browse files Browse the repository at this point in the history
…esses, 10% of the time for OpenAddresses. Gives the parser examples of names like "Korea, Republic of" in #168
  • Loading branch information
albarrentine committed Mar 25, 2017
1 parent ecfa685 commit 81c59e1
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 9 deletions.
3 changes: 2 additions & 1 deletion resources/parser/data_sets/openaddresses.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
global:
cldr_country_probability: 0.5
localized_name_probability: 0.7
localized_name_probability: 0.6
iso_alpha_2_code_probability: 0.2
iso_alpha_3_code_probability: 0.1
iso_3166_name_probability: 0.1

abbreviate_street_probability: 0.3
separate_street_probability: 0.2
Expand Down
3 changes: 2 additions & 1 deletion resources/parser/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ country:
# When the user-specified country is an ISO code, remove it from the components with this probability (fall back on geocoded components)
remove_iso_code_probability: 0.1
cldr:
localized_name_probability: 0.97
localized_name_probability: 0.92
iso_alpha_2_code_probability: 0.02
iso_alpha_3_code_probability: 0.01
iso_3166_name_probability: 0.05
11 changes: 7 additions & 4 deletions scripts/geodata/addresses/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,20 +684,23 @@ def cldr_country_name(cls, country_code, language):

alpha_2_iso_code_prob = float(cldr_config['iso_alpha_2_code_probability'])
localized_name_prob = float(cldr_config['localized_name_probability'])
iso_3166_name_prob = float(cldr_config['iso_3166_name_probability'])
alpha_3_iso_code_prob = float(cldr_config['iso_alpha_3_code_probability'])

values = ('localized', 'alpha3', 'alpha2')
probs = cdf([localized_name_prob, alpha_3_iso_code_prob, alpha_2_iso_code_prob])
localized, iso_3166, alpha3, alpha2 = values = range(4)
probs = cdf([localized_name_prob, iso_3166_name_prob, alpha_3_iso_code_prob, alpha_2_iso_code_prob])
value = weighted_choice(values, probs)

country_name = country_code.upper()

if language in (AMBIGUOUS_LANGUAGE, UNKNOWN_LANGUAGE):
language = None

if value == 'localized':
if value == localized:
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
elif value == 'alpha3':
elif value == iso_3166:
country_name = country_names.iso_3166_name(country_code)
elif value == alpha3:
country_name = country_names.alpha3_code(country_code) or country_name

return country_name
Expand Down
6 changes: 5 additions & 1 deletion scripts/geodata/countries/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(self, base_dir=CLDR_MAIN_PATH):
self.base_dir = base_dir

self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}

self.language_country_names = {}
self.country_language_names = defaultdict(dict)
Expand Down Expand Up @@ -177,7 +178,10 @@ def localized_name(self, country_code, language=None):
return self.country_language_names.get(country_code, {}).get(language)

def alpha3_code(self, alpha2_code):
    """Return the upper-cased alpha-3 code for an alpha-2 country code.

    The lookup is case-insensitive: ``alpha2_code`` is lower-cased before
    it is looked up in ``self.country_alpha3_codes``.

    Returns ``None`` when the code has no entry, so callers can fall back
    with ``or``.
    """
    alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
    return alpha3.upper() if alpha3 else None

def iso_3166_name(self, alpha2_code):
    """Return the ISO 3166 country name for an alpha-2 country code.

    The lookup is case-insensitive: ``alpha2_code`` is lower-cased before
    it is looked up in ``self.iso_3166_names``.

    Returns ``None`` when the code has no entry; callers that need a
    non-empty name must supply their own fallback.
    """
    return self.iso_3166_names.get(alpha2_code.lower())

country_names = CountryNames()
7 changes: 5 additions & 2 deletions scripts/geodata/openaddresses/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,18 +243,21 @@ def cldr_country_name(self, country_code, language, configs):
country_name = None

if random.random() < cldr_country_prob:
localized, alpha2, alpha3 = values = range(3)
localized, iso_3166, alpha2, alpha3 = values = range(4)
localized_prob = float(self.get_property('localized_name_probability', *configs))
iso_3166_prob = float(self.get_property('iso_3166_name_probability', *configs))
alpha2_prob = float(self.get_property('iso_alpha_2_code_probability', *configs))
alpha3_prob = float(self.get_property('iso_alpha_3_code_probability', *configs))

probs = cdf([localized_prob, alpha2_prob, alpha3_prob])
probs = cdf([localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])

country_type = weighted_choice(values, probs)

country_name = country_code.upper()
if country_type == localized:
country_name = country_names.localized_name(country_code, language) or country_names.localized_name(country_code) or country_name
elif country_type == iso_3166:
country_name = country_names.iso_3166_name(country_code)
elif country_type == alpha3:
country_name = country_names.alpha3_code(country_code) or country_name

Expand Down

0 comments on commit 81c59e1

Please sign in to comment.