JabRef · oscargus · Apr 10, 2016 · Apr 8, 2016
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -58,6 +58,7 @@ to [sourceforge feature requests](https://sourceforge.net/p/jabref/features/) by
 - Move PDF file directory configuration from external tab to file tab in preferences
 
 ### Fixed
+- Fixed [#318](https://github.com/JabRef/jabref/issues/318): Improve normalization of author names
 - Fixed [#598](https://github.com/JabRef/jabref/issues/598) and [#402](https://github.com/JabRef/jabref/issues/402): No more issues with invalid icons for ExternalFileTypes in global search or after editing the settings
 - Fixed [#883](https://github.com/JabRef/jabref/issues/883): No NPE during cleanup
 - Fixed [#845](https://github.com/JabRef/jabref/issues/845): Add checkboxes for highlighting in groups menu, fixes other toggle highlighting as well for all toggle buttons

diff --git a/src/main/java/net/sf/jabref/logic/formatter/bibtexfields/NormalizeNamesFormatter.java b/src/main/java/net/sf/jabref/logic/formatter/bibtexfields/NormalizeNamesFormatter.java
@@ -17,19 +17,12 @@
 
 import net.sf.jabref.logic.formatter.Formatter;
 import net.sf.jabref.logic.l10n.Localization;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import net.sf.jabref.model.entry.AuthorList;
 
 /**
  * Formatter normalizing a list of person names to the BibTeX format.
  */
 public class NormalizeNamesFormatter implements Formatter {
-    private static final Pattern LAST_F_F = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) (\\p{javaUpperCase}+)");
-    private static final Pattern LAST_FDOT_F = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) ([\\. \\p{javaUpperCase}]+)");
-    private static final Pattern F_F_LAST = Pattern.compile("(\\p{javaUpperCase}+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");
-    private static final Pattern FDOT_F_LAST = Pattern.compile("([\\. \\p{javaUpperCase}]+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");
-    private static final Pattern SINGLE_NAME = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]*)");
 
     @Override
     public String getName() {
@@ -43,210 +36,13 @@ public String getKey() {
 
     @Override
     public String format(String value) {
-        boolean andSep = false;
-        // String can contain newlines. Convert each to a space
-        String noNewlineValue = value.replace("\n", " ");
-        String[] authors = noNewlineValue.split("( |,)and ", -1);
-        if (authors.length > 1) {
-            andSep = true;
-        } else {
-            /*
-            If there are no "and" separators in the original string, we assume it either means that
-            the author list is comma or semicolon separated or that it contains only a single name.
-            If there is a semicolon, we go by that. If not, we assume commas, and count the parts
-            separated by commas to determine which it is.
-            */
-            String[] authors2 = noNewlineValue.split("; ");
-            if (authors2.length > 1) {
-                authors = authors2;
-            } else {
-                authors2 = noNewlineValue.split(", ");
-                if (authors2.length > 3) { // Probably more than a single author, so we split by commas.
-                    authors = authors2;
-                } else {
-                    if (authors2.length == 3) {
-                        // This could be a BibTeX formatted name containing a Jr particle,
-                        // e.g. Smith, Jr., Peter
-                        // We check if the middle part is <= 3 characters. If not, we assume we are
-                        // dealing with three authors.
-                        if (authors2[1].length() > 3) {
-                            authors = authors2;
-                        }
-                    }
-                }
-            }
-        }
-
-        // Remove leading and trailing whitespaces from each name:
-        for (int i = 0; i < authors.length; i++) {
-            authors[i] = authors[i].trim();
-        }
-
-        // If we found an and separator, there could possibly be semicolon or
-        // comma separation before the last separator. If there are two or more
-        // and separators, we can dismiss this possibility.
-        // If there is only a single and separator, check closer:
-        if (andSep && (authors.length == 2)) {
-            // Check if the first part is semicolon separated:
-            String[] semiSep = authors[0].split("; ");
-            if (semiSep.length > 1) {
-                // Ok, it looks like this is the case. Use separation by semicolons:
-                String[] newAuthors = new String[1 + semiSep.length];
-                for (int i = 0; i < semiSep.length; i++) {
-                    newAuthors[i] = semiSep[i].trim();
-                }
-                newAuthors[semiSep.length] = authors[1];
-                authors = newAuthors;
-            } else {
-                // Check if there is a comma in the last name. If so, we can assume that comma
-                // is not used to separate the names:
-                boolean lnfn = authors[1].indexOf(',') >= 1;
-                if (!lnfn) {
-                    String[] cmSep = authors[0].split(", ");
-                    if (cmSep.length > 1) {
-                        // This means that the last name doesn't contain a comma, but the first
-                        // one contains one or more. This indicates that the names leading up to
-                        // the single "and" are comma separated:
-                        String[] newAuthors = new String[1 + cmSep.length];
-                        for (int i = 0; i < cmSep.length; i++) {
-                            newAuthors[i] = cmSep[i].trim();
-                        }
-                        newAuthors[cmSep.length] = authors[1];
-                        authors = newAuthors;
-                    }
-
-                }
-            }
-        }
-
-        StringBuilder stringBuilder = new StringBuilder();
-        for (int i = 0; i < authors.length; i++) {
-            String norm = NormalizeNamesFormatter.normalizeName(authors[i]);
-            stringBuilder.append(norm);
-            if (i < (authors.length - 1)) {
-                stringBuilder.append(" and ");
-            }
-        }
-        return stringBuilder.toString();
+        AuthorList authorList = AuthorList.parse(value); // Delegate all name splitting and parsing to AuthorList instead of local regex heuristics
+        return authorList.getAsLastFirstNamesWithAnd(false); // NOTE(review): the boolean presumably toggles first-name abbreviation (false = keep as parsed) - confirm against AuthorList
     }
 
     @Override
     public String getDescription() {
         return Localization.lang("Normalizes lists of persons to the BibTeX standard.");
     }
 
-    private static String normalizeName(String oldName) {
-        String name = oldName;
-        Matcher matcher = NormalizeNamesFormatter.LAST_F_F.matcher(name);
-        if (matcher.matches()) {
-            String initials = matcher.group(2);
-            StringBuilder stringBuilder = new StringBuilder(matcher.group(1));
-            stringBuilder.append(", ");
-            fixInitials(initials, stringBuilder);
-            return stringBuilder.toString();
-        }
-        matcher = NormalizeNamesFormatter.LAST_FDOT_F.matcher(name);
-        if (matcher.matches()) {
-            String initials = matcher.group(2).replaceAll("[\\. ]+", "");
-            StringBuilder stringBuilder = new StringBuilder(matcher.group(1));
-            stringBuilder.append(", ");
-            fixInitials(initials, stringBuilder);
-            return stringBuilder.toString();
-        }
-
-        matcher = NormalizeNamesFormatter.F_F_LAST.matcher(name);
-        if (matcher.matches()) {
-            String initials = matcher.group(1);
-            StringBuilder stringBuilder = new StringBuilder(matcher.group(2));
-            stringBuilder.append(", ");
-            fixInitials(initials, stringBuilder);
-            return stringBuilder.toString();
-        }
-        matcher = NormalizeNamesFormatter.FDOT_F_LAST.matcher(name);
-        if (matcher.matches()) {
-            String initials = matcher.group(1).replaceAll("[\\. ]+", "");
-            StringBuilder stringBuilder = new StringBuilder(matcher.group(2));
-            stringBuilder.append(", ");
-            fixInitials(initials, stringBuilder);
-            return stringBuilder.toString();
-        }
-
-        if (name.indexOf(',') >= 0) {
-            // Name contains comma
-            int index = name.lastIndexOf(',');
-            // If the comma is at the end of the name, just remove it to prevent index error:
-            if (index == (name.length() - 1)) {
-                name = name.substring(0, name.length() - 1);
-            }
-
-            StringBuilder stringBuilder = new StringBuilder(name.substring(0, index));
-            stringBuilder.append(", ");
-            // Check if the remainder is a single name:
-            String firstName = name.substring(index + 1).trim();
-            String[] firstNameParts = firstName.split(" ");
-            if (firstNameParts.length > 1) {
-                // Multiple parts. Add all of them, and add a dot if they are single letter parts:
-                for (int i = 0; i < firstNameParts.length; i++) {
-                    if (firstNameParts[i].length() == 1) {
-                        stringBuilder.append(firstNameParts[i]).append('.');
-                    } else {
-                        stringBuilder.append(firstNameParts[i]);
-                    }
-                    if (i < (firstNameParts.length - 1)) {
-                        stringBuilder.append(' ');
-                    }
-                }
-            } else {
-                // Only a single part. Check if it looks like a name or initials:
-                Matcher nameMatcher = NormalizeNamesFormatter.SINGLE_NAME.matcher(firstNameParts[0]);
-                if (nameMatcher.matches()) {
-                    stringBuilder.append(firstNameParts[0]);
-                } else {
-                    // It looks like initials.
-                    String initials = firstNameParts[0].replaceAll("[\\.]+", "");
-                    fixInitials(initials, stringBuilder);
-                }
-
-            }
-            return stringBuilder.toString();
-        } else {
-            // Name doesn't contain comma
-            String[] parts = name.split(" +");
-            boolean allNames = true;
-            for (String part : parts) {
-                matcher = NormalizeNamesFormatter.SINGLE_NAME.matcher(part);
-                if (!matcher.matches()) {
-                    allNames = false;
-                    break;
-                }
-            }
-            if (allNames) {
-                // Looks like a name written in full with first name first.
-                // Change into last name first format:
-                StringBuilder stringBuilder = new StringBuilder(parts[parts.length - 1]);
-                if (parts.length > 1) {
-                    stringBuilder.append(',');
-                    for (int i = 0; i < (parts.length - 1); i++) {
-                        stringBuilder.append(' ').append(parts[i]);
-                        if (parts[i].length() == 1) {
-                            stringBuilder.append('.');
-                        }
-                    }
-                }
-                return stringBuilder.toString();
-            }
-        }
-
-        return name;
-    }
-
-    private static void fixInitials(final String initials, final StringBuilder stringBuilder) {
-        for (int i = 0; i < initials.length(); i++) {
-            stringBuilder.append(initials.charAt(i));
-            stringBuilder.append('.');
-            if (i < (initials.length() - 1)) {
-                stringBuilder.append(' ');
-            }
-        }
-    }
 }
diff --git a/src/main/java/net/sf/jabref/logic/util/strings/StringUtil.java b/src/main/java/net/sf/jabref/logic/util/strings/StringUtil.java
@@ -21,6 +21,7 @@
 
 import com.google.common.base.CharMatcher;
 import net.sf.jabref.Globals;
+import net.sf.jabref.model.entry.Author;
 
 public class StringUtil {
 
@@ -641,7 +642,7 @@ public static String expandAuthorInitials(String name) {
                 }
                 for (int j = 1; j < names.length; j++) {
                     if (j == 1) {
-                        sb.append(StringUtil.expandAll(names[j]));
+                        sb.append(Author.addDotIfAbbreviation(names[j]));
                     } else {
                         sb.append(names[j]);
                     }
@@ -653,7 +654,7 @@ public static String expandAuthorInitials(String name) {
             } else {
                 String[] names = authors[i].split(" ");
                 if (names.length > 0) {
-                    sb.append(StringUtil.expandAll(names[0]));
+                    sb.append(Author.addDotIfAbbreviation(names[0]));
                 }
                 for (int j = 1; j < names.length; j++) {
                     sb.append(' ');
@@ -668,48 +669,4 @@ public static String expandAuthorInitials(String name) {
         return sb.toString().trim();
     }
 
-    private static String expandAll(String s) {
-        // Avoid arrayindexoutof.... :
-        if (s.isEmpty()) {
-            return s;
-        }
-        // If only one character (uppercase letter), add a dot and return immediately:
-        if ((s.length() == 1) && Character.isLetter(s.charAt(0)) &&
-                Character.isUpperCase(s.charAt(0))) {
-            return s + ".";
-        }
-        StringBuilder sb = new StringBuilder();
-        char c = s.charAt(0);
-        char d = 0;
-        for (int i = 1; i < s.length(); i++) {
-            d = s.charAt(i);
-            if (Character.isLetter(c) && Character.isUpperCase(c) &&
-                    Character.isLetter(d) && Character.isUpperCase(d)) {
-                // AA -> A. A.
-                sb.append(c);
-                sb.append(". ");
-            } else if (Character.isLetter(c) && Character.isUpperCase(c) &&
-                    ('-' == d)) {
-                // A-A -> A.-A.
-                sb.append(c);
-                sb.append(".");
-            } else if ((c == '.') && Character.isLetter(d)
-                    && Character.isUpperCase(d)) {
-                // A.A. -> A. A.
-                sb.append(". ");
-            } else {
-                sb.append(c);
-            }
-            c = d;
-        }
-        if (Character.isLetter(c) && Character.isUpperCase(c) &&
-                Character.isLetter(d) && Character.isUpperCase(d)) {
-            sb.append(c);
-            sb.append(". ");
-        } else {
-            sb.append(c);
-        }
-        return sb.toString().trim();
-    }
-
 }