derek73 · me21 · May 27, 2024 · May 27, 2024 · May 28, 2024
diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """
 The :py:mod:`nameparser.config` module manages the configuration of the
-nameparser. 
+nameparser.
 
 A module-level instance of :py:class:`~nameparser.config.Constants` is created
 and used by default for all HumanName instances. You can adjust the entire module's
@@ -25,7 +25,7 @@
     >>> hn.parse_full_name() # need to run this again after config changes
 
 **Potential Gotcha**: If you do not pass ``None`` as the second argument,
-``hn.C`` will be a reference to the module config, possibly yielding 
+``hn.C`` will be a reference to the module config, possibly yielding
 unexpected results. See `Customizing the Parser <customize.html>`_.
 """
 from __future__ import unicode_literals
@@ -57,7 +57,7 @@ class SetManager(Set):
 
     Only special functionality beyond that provided by set() is
     to normalize constants for comparison (lower case, no periods)
-    when they are add()ed and remove()d and allow passing multiple 
+    when they are add()ed and remove()d and allow passing multiple
     string arguments to the :py:func:`add()` and :py:func:`remove()` methods.
 
     '''
@@ -125,7 +125,7 @@ def remove(self, *strings):
 
 class TupleManager(dict):
     '''
-    A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants 
+    A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants
     more friendly.
     '''
 
@@ -148,23 +148,23 @@ class Constants(object):
     """
     An instance of this class hold all of the configuration constants for the parser.
 
-    :param set prefixes: 
+    :param set prefixes:
         :py:attr:`prefixes` wrapped with :py:class:`SetManager`.
-    :param set titles: 
+    :param set titles:
         :py:attr:`titles` wrapped with :py:class:`SetManager`.
-    :param set first_name_titles: 
+    :param set first_name_titles:
         :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`.
-    :param set suffix_acronyms: 
+    :param set suffix_acronyms:
         :py:attr:`~suffixes.SUFFIX_ACRONYMS`  wrapped with :py:class:`SetManager`.
-    :param set suffix_not_acronyms: 
+    :param set suffix_not_acronyms:
         :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS`  wrapped with :py:class:`SetManager`.
-    :param set conjunctions: 
+    :param set conjunctions:
         :py:attr:`conjunctions`  wrapped with :py:class:`SetManager`.
     :type capitalization_exceptions: tuple or dict
-    :param capitalization_exceptions: 
+    :param capitalization_exceptions:
         :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`.
     :type regexes: tuple or dict
-    :param regexes: 
+    :param regexes:
         :py:attr:`regexes`  wrapped with :py:class:`TupleManager`.
     """
 
@@ -187,17 +187,17 @@ class Constants(object):
     empty_attribute_default = ''
     """
     Default return value for empty attributes.
-    
+
     .. doctest::
-    
+
         >>> from nameparser.config import CONSTANTS
         >>> CONSTANTS.empty_attribute_default = None
         >>> name = HumanName("John Doe")
         >>> name.title
         None
         >>>name.first
         'John'
-        
+
     """
 
     capitalize_name = False
@@ -231,6 +231,11 @@ class Constants(object):
 
     """
 
+    try_russian_name_specifics = False
+    """
+    If set, the parser will also try to detect names written in the Russian order (Last First Middle, without a comma) and reorder them. Off by default.
+    """
+
     def __init__(self,
                  prefixes=PREFIXES,
                  suffix_acronyms=SUFFIX_ACRONYMS,
@@ -239,7 +244,8 @@ def __init__(self,
                  first_name_titles=FIRST_NAME_TITLES,
                  conjunctions=CONJUNCTIONS,
                  capitalization_exceptions=CAPITALIZATION_EXCEPTIONS,
-                 regexes=REGEXES
+                 regexes=REGEXES,
+                 try_russian_name_specifics=False,
                  ):
         self.prefixes = SetManager(prefixes)
         self.suffix_acronyms = SetManager(suffix_acronyms)
@@ -249,6 +255,7 @@ def __init__(self,
         self.conjunctions = SetManager(conjunctions)
         self.capitalization_exceptions = TupleManager(capitalization_exceptions)
         self.regexes = TupleManager(regexes)
+        self.try_russian_name_specifics = try_russian_name_specifics
         self._pst = None
 
     @property

diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py
@@ -8,14 +8,14 @@
     re_emoji = re.compile('['
         '\U0001F300-\U0001F64F'
         '\U0001F680-\U0001F6FF'
-        '\u2600-\u26FF\u2700-\u27BF]+', 
+        '\u2600-\u26FF\u2700-\u27BF]+',
         re.UNICODE)
 except re.error:
     # Narrow UCS-2 build
     re_emoji = re.compile('('
         '\ud83c[\udf00-\udfff]|'
         '\ud83d[\udc00-\ude4f\ude80-\udeff]|'
-        '[\u2600-\u26FF\u2700-\u27BF])+', 
+        '[\u2600-\u26FF\u2700-\u27BF])+',
         re.UNICODE)
 
 REGEXES = set([
@@ -31,6 +31,12 @@
     ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)),
     ("emoji",re_emoji),
     ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)),
+    ("russian_last_name_endings", re.compile(r'^.+(ov|ova|ev|eva|yov|yova|in|yn|ina|sky|skaya|ich|ych|uk|yuk|yk|ko|ak|ukh|ykh|ikh|chuk|yy|yi|oy|oi|iy|ii)$', re.I | re.U)),
+    ("russian_last_name_endings_cyrillic", re.compile(r'^.+(ов|ова|ев|ева|ёв|ёва|ин|ын|ина|ский|ская|цкая|цкий|ич|ыч|ук|юк|ык|ко|ак|ух|ых|их|чук|ый|ой|ий)$', re.I | re.U)),
+    ("russian_patronymic_endings", re.compile(r'^(.+(ovich|ovna|evich|evna|ichna)|ilyich|kuzmich|lukich|fomich|fokich)$', re.I | re.U)),  # both alternatives anchored at both ends
+    ("russian_patronymic_endings_cyrillic", re.compile(r'^(.+(ович|овна|евич|евна|ична)|ильич|кузьмич|лукич|фомич|фокич)$', re.I | re.U)),  # both alternatives anchored at both ends
+    ("turkic_patronymic_suffixes", re.compile(r'^(oglu|ogly|qizi|kizi|kyzy|gyzy|uly|uulu)$', re.I | re.U)),
+    ("turkic_patronymic_suffixes_cyrillic", re.compile(r'^(оглу|оглы|кызы|гызы|улы|уулу)$', re.I | re.U)),
 ])
 """
 All regular expressions used by the parser are precompiled and stored in the config.

diff --git a/nameparser/parser.py b/nameparser/parser.py
@@ -512,6 +512,8 @@ def post_process(self):
         and :py:func:`handle_capitalization`.
         """
         self.handle_firstnames()
+        if self.C.try_russian_name_specifics:
+            self.handle_russian_name_specifics()
         self.handle_capitalization()
 
     def fix_phd(self):
@@ -568,6 +570,76 @@ def handle_firstnames(self):
                 and not lc(self.title) in self.C.first_name_titles:
             self.last, self.first = self.first, self.last
 
+    def is_turkic_patronymic(self, piece):  # match object if piece is exactly a Turkic patronymic word (Latin or Cyrillic, e.g. "ogly"), else None
+        return self.C.regexes.turkic_patronymic_suffixes.match(piece) or self.C.regexes.turkic_patronymic_suffixes_cyrillic.match(piece)
+
+    def handle_russian_name_specifics(self):
+        """
+        Reorder a name written in the Russian order (Last First [Middle], no comma) into First [Middle] Last.
+        The Russian order is assumed when EITHER the first name looks like a Russian last name
+        (this misfires on names like Martin, Franklin or Benjamin - hence the extra config parameter),
+        OR the last name looks like a Russian or Turkic patronymic (this misses a Last First name with no
+        patronymic and a foreign last name, e.g. Olurombi Alexey). Last First with an empty middle is handled too.
+        """
+        is_name_order_lfm = (self.first and self.is_russian_last_name(self.first)) or (
+            # if the middle name also looks like a patronymic, the order is First Middle Last, e.g. Roman Alexeevich
+            # Abramovich (a last name that merely looks like a patronymic); ``or ''`` guards a middle returned as None
+            self.last and self.is_russian_patronymic(self.last) and not self.is_russian_patronymic(self.middle or '')
+        ) or (  # some Russian citizens have patronymics of turkic origin, e.g. Said Ogly
+            self.last and self.is_turkic_patronymic(self.last)
+        )
+
+        # rare case: the last name consists of two or more words separated by spaces and
+        # some of them got incorrectly parsed as first/middle names;
+        # Russian middle names are patronymics and consist of one word only
+        if len(self.middle_list) > 1:
+            # exception to this rule: turkic origin patronymics (e.g. Said Ogly <- two pieces!)
+            if is_name_order_lfm:
+                if self.is_turkic_patronymic(self.last):
+                    # e.g. "Ahmedov Oktay Said Ogly" <- Said should be moved to Ogly
+                    self.last_list = self.middle_list[1:] + self.last_list
+                    self.middle_list = [self.middle_list[0]]
+                else:
+                    # the extra words of the last name got parsed as middle names (the last name goes first in the input):
+                    # take all elements of middle_list except the last one and append them to first_list
+                    # (it will be rotated to last_list below)
+                    self.first_list += self.middle_list[:-1]
+                    # the last element of middle_list is the real first name (will be rotated to first_list below)
+                    self.middle_list = [self.middle_list[-1]]
+            else:
+                if self.is_turkic_patronymic(self.middle_list[-1]):
+                    pass  # no specific treatment needed
+                else:
+                    # the last name goes last in the input, so every part of it except the final word got parsed
+                    # as a middle name: keep only the first middle word and move the rest to the last name
+                    self.last_list = self.middle_list[1:] + self.last_list
+                    self.middle_list = [self.middle_list[0]]
+
+        if is_name_order_lfm:
+            # input was Last First [Middle]: the trailing patronymic got parsed as the last name
+            if self.middle:
+                # rotate the name components
+                self.first, self.middle, self.last = self.middle, self.last, self.first
+            else:
+                self.first, self.last = self.last, self.first
+
+    def is_russian_last_name(self, piece):
+        """
+        Return True when ``piece`` ends in a typical Slavic last name suffix
+        (Latin or Cyrillic) and is not a known first name with such an ending.
+        """
+        # first names that would otherwise match the last name ending regexes
+        first_name_exceptions = ('lev', 'eva', 'yacov', 'yakov', 'veniamin',
+                                 'lyubov', 'lubov', 'nina',
+                                 'лев', 'ева', 'яков', 'вениамин', 'нина')
+        if piece.lower() in first_name_exceptions:
+            return False
+        return bool(self.C.regexes.russian_last_name_endings.match(piece)
+                    or self.C.regexes.russian_last_name_endings_cyrillic.match(piece))
+
+    def is_russian_patronymic(self, piece):  # match object if piece looks like a Russian patronymic (Latin or Cyrillic), else None
+        return self.C.regexes.russian_patronymic_endings.match(piece) or self.C.regexes.russian_patronymic_endings_cyrillic.match(piece)
+
     def parse_full_name(self):
         """
 
@@ -764,7 +836,7 @@ def parse_pieces(self, parts, additional_parts_count=0):
                 titles = list(filter(self.is_title,  period_chunks))
                 suffixes = list(filter(self.is_suffix, period_chunks))
 
-                # add the part to the constant so it will be found
+                # add the part to the constant so it will be found
                 if len(list(titles)):
                     self.C.titles.add(part)
                     continue

diff --git a/tests.py b/tests.py
@@ -2387,6 +2387,87 @@ def test_constructor_multiple(self):
         self.m(hn.title, "mytitle", hn)
 
 
+class RussianNameOrderTestCase(HumanNameTestBase):  # parsing with the Russian name order handling switched on
+    C = Constants(try_russian_name_specifics=True)  # dedicated config instance passed to every HumanName below
+
+    def test_russian_name_specific_order(self):
+        hn = HumanName("Zarubkin Alexander Sergeevich", constants=self.C)  # Last First Middle input
+        self.m(hn.first, "Alexander", hn)
+        self.m(hn.middle, "Sergeevich", hn)
+        self.m(hn.last, "Zarubkin", hn)
+
+    def test_specific_order_without_patronymic(self):
+        hn = HumanName("Zarubkin Alexander", constants=self.C)  # Last First input, no middle name
+        self.m(hn.first, "Alexander", hn)
+        self.m(hn.last, "Zarubkin", hn)
+
+    def test_last_name_with_dash_specific_order(self):
+        hn = HumanName("Blokin-Mechtalin Konstantin Yurievich", constants=self.C)  # hyphenated last name first
+        self.m(hn.first, "Konstantin", hn)
+        self.m(hn.middle, "Yurievich", hn)
+        self.m(hn.last, "Blokin-Mechtalin", hn)
+
+    def test_russian_name_with_african_origin(self):
+        hn = HumanName("Alexey Richardovich Olurombi Akinwale", constants=self.C)  # two-word last name at the end
+        self.m(hn.first, "Alexey", hn)
+        self.m(hn.middle, "Richardovich", hn)
+        self.m(hn.last, "Olurombi Akinwale", hn)
+
+    def test_russian_name_specific_order_with_african_origin(self):
+        hn = HumanName("Olurombi Akinwale Alexey Richardovich", constants=self.C)  # two-word last name first
+        self.m(hn.first, "Alexey", hn)
+        self.m(hn.middle, "Richardovich", hn)
+        self.m(hn.last, "Olurombi Akinwale", hn)
+
+    def test_last_name_like_russian_patronymic(self):
+        hn = HumanName("Sergey Vitalyevich Petsevich", constants=self.C)  # last name ends like a patronymic
+        self.m(hn.first, "Sergey", hn)
+        self.m(hn.middle, "Vitalyevich", hn)
+        self.m(hn.last, "Petsevich", hn)
+
+    def test_last_name_like_russian_patronymic_specific_order(self):
+        hn = HumanName("Petsevich Sergey Vitalyevich", constants=self.C)  # same name, last name first
+        self.m(hn.first, "Sergey", hn)
+        self.m(hn.middle, "Vitalyevich", hn)
+        self.m(hn.last, "Petsevich", hn)
+
+    def test_turkic_patronymic(self):
+        hn = HumanName("Leyla Said Gyzy Ahmedova", constants=self.C)  # two-word Turkic patronymic in the middle
+        self.m(hn.first, "Leyla", hn)
+        self.m(hn.middle, "Said Gyzy", hn)
+        self.m(hn.last, "Ahmedova", hn)
+
+    def test_turkic_patronymic_specific_order(self):
+        hn = HumanName("Ahmedova Leyla Said Gyzy", constants=self.C)  # same name, last name first
+        self.m(hn.first, "Leyla", hn)
+        self.m(hn.middle, "Said Gyzy", hn)
+        self.m(hn.last, "Ahmedova", hn)
+
+    # The surnames below end in -y (-ый/-ий in Russian). That ending is deliberately left out of the
+    # Russian last name endings list because the resulting regex would be too broad.
+    # However, if the first name is followed by a patronymic, the name is still caught and parsed properly.
+    # If the ending is transliterated as -yi/-yy/-iy/-ii instead of -y, it is also recognized properly.
+    # It's a shame the usual transliteration of -ый/-ий to English is -y (e.g. Sikorsky);
+    # presumably it follows the rules for similar last names in the Polish language.
+    # The most popular -y endings, -ский/-цкий (-sky/-tsky), are already covered, but corner cases like this one remain.
+    @unittest.expectedFailure
+    def test_tricky_case1(self):
+        hn = HumanName("Mogilny Alexander", constants=self.C)  # famous hockey player
+        self.m(hn.first, "Alexander", hn)
+        self.m(hn.last, "Mogilny", hn)
+
+    def test_tricky_case2(self):
+        hn = HumanName("Mogilny Alexander Gennadyevich", constants=self.C)  # famous hockey player
+        self.m(hn.first, "Alexander", hn)
+        self.m(hn.middle, "Gennadyevich", hn)
+        self.m(hn.last, "Mogilny", hn)
+
+    def test_tricky_case3(self):
+        hn = HumanName("Mogilnyy Alexander", constants=self.C)  # famous hockey player
+        self.m(hn.first, "Alexander", hn)
+        self.m(hn.last, "Mogilnyy", hn)
+
+
 TEST_NAMES = (
     "John Doe",
     "John Doe, Jr.",