Skip to content

Add Russian name official order handling #154

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 23 additions & 16 deletions nameparser/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""
The :py:mod:`nameparser.config` module manages the configuration of the
nameparser.
nameparser.

A module-level instance of :py:class:`~nameparser.config.Constants` is created
and used by default for all HumanName instances. You can adjust the entire module's
Expand All @@ -25,7 +25,7 @@
>>> hn.parse_full_name() # need to run this again after config changes

**Potential Gotcha**: If you do not pass ``None`` as the second argument,
``hn.C`` will be a reference to the module config, possibly yielding
``hn.C`` will be a reference to the module config, possibly yielding
unexpected results. See `Customizing the Parser <customize.html>`_.
"""
from __future__ import unicode_literals
Expand Down Expand Up @@ -57,7 +57,7 @@ class SetManager(Set):

Only special functionality beyond that provided by set() is
to normalize constants for comparison (lower case, no periods)
when they are add()ed and remove()d and allow passing multiple
when they are add()ed and remove()d and allow passing multiple
string arguments to the :py:func:`add()` and :py:func:`remove()` methods.

'''
Expand Down Expand Up @@ -125,7 +125,7 @@ def remove(self, *strings):

class TupleManager(dict):
'''
A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants
A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants
more friendly.
'''

Expand All @@ -148,23 +148,23 @@ class Constants(object):
"""
An instance of this class holds all of the configuration constants for the parser.

:param set prefixes:
:param set prefixes:
:py:attr:`prefixes` wrapped with :py:class:`SetManager`.
:param set titles:
:param set titles:
:py:attr:`titles` wrapped with :py:class:`SetManager`.
:param set first_name_titles:
:param set first_name_titles:
:py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`.
:param set suffix_acronyms:
:param set suffix_acronyms:
:py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`.
:param set suffix_not_acronyms:
:param set suffix_not_acronyms:
:py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`.
:param set conjunctions:
:param set conjunctions:
:py:attr:`conjunctions` wrapped with :py:class:`SetManager`.
:type capitalization_exceptions: tuple or dict
:param capitalization_exceptions:
:param capitalization_exceptions:
:py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`.
:type regexes: tuple or dict
:param regexes:
:param regexes:
:py:attr:`regexes` wrapped with :py:class:`TupleManager`.
"""

Expand All @@ -187,17 +187,17 @@ class Constants(object):
empty_attribute_default = ''
"""
Default return value for empty attributes.

.. doctest::

>>> from nameparser.config import CONSTANTS
>>> CONSTANTS.empty_attribute_default = None
>>> name = HumanName("John Doe")
>>> name.title
None
>>> name.first
'John'

"""

capitalize_name = False
Expand Down Expand Up @@ -231,6 +231,11 @@ class Constants(object):

"""

try_russian_name_specifics = False
"""
If set, the parser will also attempt to recognize names written in the Russian order (Last First Middle, without a comma) and reorder them.
"""

def __init__(self,
prefixes=PREFIXES,
suffix_acronyms=SUFFIX_ACRONYMS,
Expand All @@ -239,7 +244,8 @@ def __init__(self,
first_name_titles=FIRST_NAME_TITLES,
conjunctions=CONJUNCTIONS,
capitalization_exceptions=CAPITALIZATION_EXCEPTIONS,
regexes=REGEXES
regexes=REGEXES,
try_russian_name_specifics=False,
):
self.prefixes = SetManager(prefixes)
self.suffix_acronyms = SetManager(suffix_acronyms)
Expand All @@ -249,6 +255,7 @@ def __init__(self,
self.conjunctions = SetManager(conjunctions)
self.capitalization_exceptions = TupleManager(capitalization_exceptions)
self.regexes = TupleManager(regexes)
self.try_russian_name_specifics = try_russian_name_specifics
self._pst = None

@property
Expand Down
10 changes: 8 additions & 2 deletions nameparser/config/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
re_emoji = re.compile('['
'\U0001F300-\U0001F64F'
'\U0001F680-\U0001F6FF'
'\u2600-\u26FF\u2700-\u27BF]+',
'\u2600-\u26FF\u2700-\u27BF]+',
re.UNICODE)
except re.error:
# Narrow UCS-2 build
re_emoji = re.compile('('
'\ud83c[\udf00-\udfff]|'
'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
'[\u2600-\u26FF\u2700-\u27BF])+',
'[\u2600-\u26FF\u2700-\u27BF])+',
re.UNICODE)

REGEXES = set([
Expand All @@ -31,6 +31,12 @@
("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)),
("emoji",re_emoji),
("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)),
("russian_last_name_endings", re.compile(r'^.+(ov|ova|ev|eva|yov|yova|in|yn|ina|sky|skaya|ich|ych|uk|yuk|yk|ko|ak|ukh|ykh|ikh|chuk|yy|yi|oy|oi|iy|ii)$', re.I | re.U)),
("russian_last_name_endings_cyrillic", re.compile(r'^.+(ов|ова|ев|ева|ёв|ёва|ин|ын|ина|ский|ская|цкая|цкий|ич|ыч|ук|юк|ык|ко|ак|ух|ых|их|чук|ый|ой|ий)$', re.I | re.U)),
("russian_patronymic_endings", re.compile(r'^(.+(ovich|ovna|evich|evna|ichna))|(ilyich|kuzmich|lukich|fomich|fokich)$', re.I | re.U)),
("russian_patronymic_endings_cyrillic", re.compile(r'^(.+(ович|овна|евич|евна|ична))|(ильич|кузьмич|лукич|фомич|фокич)$', re.I | re.U)),
("turkic_patronymic_suffixes", re.compile(r'^(oglu|ogly|qizi|kizi|kyzy|gyzy|uly|uulu)$', re.I | re.U)),
("turkic_patronymic_suffixes_cyrillic", re.compile(r'^(оглу|оглы|кызы|гызы|улы|уулу)$', re.I | re.U)),
])
"""
All regular expressions used by the parser are precompiled and stored in the config.
Expand Down
74 changes: 73 additions & 1 deletion nameparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,8 @@ def post_process(self):
and :py:func:`handle_capitalization`.
"""
self.handle_firstnames()
if self.C.try_russian_name_specifics:
self.handle_russian_name_specifics()
self.handle_capitalization()

def fix_phd(self):
Expand Down Expand Up @@ -568,6 +570,76 @@ def handle_firstnames(self):
and not lc(self.title) in self.C.first_name_titles:
self.last, self.first = self.first, self.last

def is_turkic_patronymic(self, piece):
    """
    Tell whether ``piece`` is a Turkic patronymic suffix word (e.g. "Ogly").

    The word is checked against the ``turkic_patronymic_suffixes`` and
    ``turkic_patronymic_suffixes_cyrillic`` patterns stored in
    ``self.C.regexes``.

    :param piece: a single name piece; may be empty or ``None``.
    :return: ``True`` if either pattern matches, ``False`` otherwise.
    :rtype: bool
    """
    # Name attributes return ``empty_attribute_default`` when empty, and that
    # may be configured as ``None``; a regex cannot be matched against it.
    if not piece:
        return False
    regexes = self.C.regexes
    return bool(
        regexes.turkic_patronymic_suffixes.match(piece)
        or regexes.turkic_patronymic_suffixes_cyrillic.match(piece)
    )

def handle_russian_name_specifics(self):
    """
    Reorder a name that was entered in the Russian order.

    Russian names are often written as "Last First Middle" (without a
    comma) instead of "First Middle Last". The input is treated as
    last-name-first when ANY of the following holds:

    * the parsed first name looks like a Russian last name (this misfires
      on names such as Martin, Franklin or Benjamin, which is why the
      behaviour is opt-in through a config flag);
    * the parsed last name looks like a Russian patronymic while the parsed
      middle name does not (if both do, e.g. "Roman Alexeevich Abramovich",
      the input is already First Middle Last; this heuristic misfires on
      "Olurombi Alexey", a foreign last name with no patronymic);
    * the parsed last name is a Turkic patronymic suffix (e.g. "Said Ogly").

    Multi-word middle names are split up first, because a Russian
    patronymic is a single word (Turkic patronymics being the exception),
    so the surplus words belong to the last name. The method works in
    place on ``first``/``middle``/``last`` and their ``*_list`` attributes
    and returns nothing.
    """
    # Every regex-backed check is guarded against empty attributes: when
    # ``empty_attribute_default`` is set to ``None`` the string attributes
    # evaluate to ``None``, which cannot be matched against a regex.
    first_is_last_name = bool(
        self.first and self.is_russian_last_name(self.first))
    last_is_patronymic = bool(
        self.last and self.is_russian_patronymic(self.last))
    middle_is_patronymic = bool(
        self.middle and self.is_russian_patronymic(self.middle))
    last_is_turkic = bool(
        self.last and self.is_turkic_patronymic(self.last))

    is_name_order_lfm = (
        first_is_last_name
        or (last_is_patronymic and not middle_is_patronymic)
        or last_is_turkic
    )

    # Rare case: a multi-word last name had one of its words parsed as a
    # first/middle name.
    if len(self.middle_list) > 1:
        if is_name_order_lfm:
            if last_is_turkic:
                # e.g. "Ahmedov Oktay Said Ogly": "Said" belongs with "Ogly"
                self.last_list = self.middle_list[1:] + self.last_list
                self.middle_list = [self.middle_list[0]]
            else:
                # Last name came first, so its extra words landed at the
                # front of the middle name: append them to first_list
                # (rotated into last below) and keep only the final word.
                self.first_list += self.middle_list[:-1]
                self.middle_list = [self.middle_list[-1]]
        elif not self.is_turkic_patronymic(self.middle_list[-1]):
            # Last name came last: everything after the first middle word
            # is part of the last name. A middle name ending in a Turkic
            # suffix is legitimately two words and is left alone.
            self.last_list = self.middle_list[1:] + self.last_list
            self.middle_list = [self.middle_list[0]]

    if is_name_order_lfm:
        if self.middle:
            # Last First Middle -> First Middle Last
            self.first, self.middle, self.last = \
                self.middle, self.last, self.first
        else:
            # Last First -> First Last
            self.first, self.last = self.last, self.first

def is_russian_last_name(self, piece):
    """
    Tell whether ``piece`` looks like a Russian last name.

    A piece qualifies when it ends in one of the suffixes covered by the
    ``russian_last_name_endings`` / ``russian_last_name_endings_cyrillic``
    patterns in ``self.C.regexes``, unless it is one of a few first names
    that happen to carry such a suffix.

    :param piece: a single name piece; may be empty or ``None``.
    :return: ``True`` if the piece looks like a Russian last name.
    :rtype: bool
    """
    # An empty attribute may be reported as ``None`` (see
    # ``empty_attribute_default``); there is nothing to classify then.
    if not piece:
        return False
    # some first names match these regexes, so we check them first
    if piece.lower() in ('lev', 'eva', 'yacov', 'yakov', 'veniamin',
                         'lyubov', 'lubov', 'nina',
                         'лев', 'ева', 'яков', 'вениамин',
                         'нина'):
        return False
    regexes = self.C.regexes
    return bool(
        regexes.russian_last_name_endings.match(piece)
        or regexes.russian_last_name_endings_cyrillic.match(piece)
    )

def is_russian_patronymic(self, piece):
    """
    Tell whether ``piece`` looks like a Russian patronymic.

    The piece is checked against the ``russian_patronymic_endings`` and
    ``russian_patronymic_endings_cyrillic`` patterns in ``self.C.regexes``.

    :param piece: a single name piece; may be empty or ``None``.
    :return: ``True`` if one of the patterns matches the whole piece.
    :rtype: bool
    """
    # An empty attribute may be reported as ``None`` (see
    # ``empty_attribute_default``); a regex cannot be matched against it.
    if not piece:
        return False
    regexes = self.C.regexes
    for pattern in (regexes.russian_patronymic_endings,
                    regexes.russian_patronymic_endings_cyrillic):
        found = pattern.match(piece)
        # The suffix alternative of these patterns is not anchored with
        # ``$``, so ``match`` alone accepts words that merely *contain* a
        # patronymic suffix. Require the match to cover the whole piece.
        if found and found.end() == len(piece):
            return True
    return False

def parse_full_name(self):
"""

Expand Down Expand Up @@ -764,7 +836,7 @@ def parse_pieces(self, parts, additional_parts_count=0):
titles = list(filter(self.is_title, period_chunks))
suffixes = list(filter(self.is_suffix, period_chunks))

# add the part to the constant so it will be found
# add the part to the constant so it will be found
if len(list(titles)):
self.C.titles.add(part)
continue
Expand Down
81 changes: 81 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2387,6 +2387,87 @@ def test_constructor_multiple(self):
self.m(hn.title, "mytitle", hn)


class RussianNameOrderTestCase(HumanNameTestBase):
    # Every test parses with a dedicated config that has the Russian name
    # order handling switched on.
    C = Constants(try_russian_name_specifics=True)

    def _check(self, full_name, first, last, middle=None):
        # Parse ``full_name`` and compare the parts in first, middle, last
        # order; the middle name is only compared when one is expected.
        hn = HumanName(full_name, constants=self.C)
        self.m(hn.first, first, hn)
        if middle is not None:
            self.m(hn.middle, middle, hn)
        self.m(hn.last, last, hn)

    def test_russian_name_specific_order(self):
        self._check("Zarubkin Alexander Sergeevich",
                    first="Alexander", middle="Sergeevich", last="Zarubkin")

    def test_specific_order_without_patronymic(self):
        self._check("Zarubkin Alexander",
                    first="Alexander", last="Zarubkin")

    def test_last_name_with_dash_specific_order(self):
        self._check("Blokin-Mechtalin Konstantin Yurievich",
                    first="Konstantin", middle="Yurievich",
                    last="Blokin-Mechtalin")

    def test_russian_name_with_african_origin(self):
        self._check("Alexey Richardovich Olurombi Akinwale",
                    first="Alexey", middle="Richardovich",
                    last="Olurombi Akinwale")

    def test_russian_name_specific_order_with_african_origin(self):
        self._check("Olurombi Akinwale Alexey Richardovich",
                    first="Alexey", middle="Richardovich",
                    last="Olurombi Akinwale")

    def test_last_name_like_russian_patronymic(self):
        self._check("Sergey Vitalyevich Petsevich",
                    first="Sergey", middle="Vitalyevich", last="Petsevich")

    def test_last_name_like_russian_patronymic_specific_order(self):
        self._check("Petsevich Sergey Vitalyevich",
                    first="Sergey", middle="Vitalyevich", last="Petsevich")

    def test_turkic_patronymic(self):
        self._check("Leyla Said Gyzy Ahmedova",
                    first="Leyla", middle="Said Gyzy", last="Ahmedova")

    def test_turkic_patronymic_specific_order(self):
        self._check("Ahmedova Leyla Said Gyzy",
                    first="Leyla", middle="Said Gyzy", last="Ahmedova")

    # The surnames below end in -y (-ый/-ий in Russian). That ending is
    # deliberately left out of the Russian last name endings list because
    # the resulting regex would be too broad.
    # If the first name is followed by a patronymic, the name is still
    # caught and parsed properly; the same holds when the ending is
    # transliterated as -yi/-yy/-iy/-ii instead of -y.
    # Unfortunately the usual English transliteration of -ый/-ий is -y
    # (e.g. Sikorsky), presumably following the convention for similar
    # Polish last names. The most popular -y endings, -ский/-цкий
    # (-sky/-tsky), are covered, but corner cases like this one remain.
    @unittest.expectedFailure
    def test_tricky_case1(self):
        # Alexander Mogilny, the famous hockey player
        self._check("Mogilny Alexander",
                    first="Alexander", last="Mogilny")

    def test_tricky_case2(self):
        # same person, this time with the patronymic
        self._check("Mogilny Alexander Gennadyevich",
                    first="Alexander", middle="Gennadyevich", last="Mogilny")

    def test_tricky_case3(self):
        # same person, with the surname transliterated as -yy
        self._check("Mogilnyy Alexander",
                    first="Alexander", last="Mogilnyy")


TEST_NAMES = (
"John Doe",
"John Doe, Jr.",
Expand Down