Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace NormalizeNamesFormatter by AuthorList and so fix #318 #1149

Merged
merged 1 commit into from
Apr 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ to [sourceforge feature requests](https://sourceforge.net/p/jabref/features/) by
- Move PDF file directory configuration from external tab to file tab in preferences

### Fixed
- Fixed [#318](https://github.com/JabRef/jabref/issues/318): Improve normalization of author names
- Fixed [#598](https://github.com/JabRef/jabref/issues/598) and [#402](https://github.com/JabRef/jabref/issues/402): No more issues with invalid icons for ExternalFileTypes in global search or after editing the settings
- Fixed [#883](https://github.com/JabRef/jabref/issues/883): No NPE during cleanup
- Fixed [#845](https://github.com/JabRef/jabref/issues/845): Add checkboxes for highlighting in groups menu, fixes other toggle highlighting as well for all toggle buttons
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,12 @@

import net.sf.jabref.logic.formatter.Formatter;
import net.sf.jabref.logic.l10n.Localization;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.jabref.model.entry.AuthorList;

/**
* Formatter normalizing a list of person names to the BibTeX format.
*/
public class NormalizeNamesFormatter implements Formatter {
private static final Pattern LAST_F_F = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) (\\p{javaUpperCase}+)");
private static final Pattern LAST_FDOT_F = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]+) ([\\. \\p{javaUpperCase}]+)");
private static final Pattern F_F_LAST = Pattern.compile("(\\p{javaUpperCase}+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");
private static final Pattern FDOT_F_LAST = Pattern.compile("([\\. \\p{javaUpperCase}]+) (\\p{javaUpperCase}[\\p{javaLowerCase}]+)");
private static final Pattern SINGLE_NAME = Pattern.compile("(\\p{javaUpperCase}[\\p{javaLowerCase}]*)");

@Override
public String getName() {
Expand All @@ -43,210 +36,13 @@ public String getKey() {

/**
 * Formats a list of person names.
 * <p>
 * NOTE(review): this span comes from a diff view without +/- markers. Everything up to and
 * including {@code return stringBuilder.toString();} appears to be the removed hand-written
 * implementation; the two {@code AuthorList} statements at the end appear to be its replacement
 * (the file header of the change imports {@code AuthorList}) - confirm against the actual file.
 * <p>
 * The hand-written implementation splits the value into individual names (on "and", then
 * semicolons, then commas, using the heuristics commented below), trims each name, passes each
 * one through {@code normalizeName} and joins the results with " and ".
 *
 * @param value the raw field content; newlines are treated as spaces
 * @return the names joined by " and "
 */
@Override
public String format(String value) {
// Set to true once at least one "and" separator was found in the input.
boolean andSep = false;
// String can contain newlines. Convert each to a space
String noNewlineValue = value.replace("\n", " ");
// Split on "and" preceded by a space or a comma; limit -1 keeps trailing empty strings.
String[] authors = noNewlineValue.split("( |,)and ", -1);
if (authors.length > 1) {
andSep = true;
} else {
/*
If there are no "and" separators in the original string, we assume it either means that
the author list is comma or semicolon separated or that it contains only a single name.
If there is a semicolon, we go by that. If not, we assume commas, and count the parts
separated by commas to determine which it is.
*/
String[] authors2 = noNewlineValue.split("; ");
if (authors2.length > 1) {
authors = authors2;
} else {
authors2 = noNewlineValue.split(", ");
if (authors2.length > 3) { // Probably more than a single author, so we split by commas.
authors = authors2;
} else {
if (authors2.length == 3) {
// This could be a BibTeX formatted name containing a Jr particle,
// e.g. Smith, Jr., Peter
// We check if the middle part is <= 3 characters. If not, we assume we are
// dealing with three authors.
if (authors2[1].length() > 3) {
authors = authors2;
}
}
}
}
}

// Remove leading and trailing whitespaces from each name:
for (int i = 0; i < authors.length; i++) {
authors[i] = authors[i].trim();
}

// If we found an and separator, there could possibly be semicolon or
// comma separation before the last separator. If there are two or more
// and separators, we can dismiss this possibility.
// If there is only a single and separator, check closer:
if (andSep && (authors.length == 2)) {
// Check if the first part is semicolon separated:
String[] semiSep = authors[0].split("; ");
if (semiSep.length > 1) {
// Ok, it looks like this is the case. Use separation by semicolons:
String[] newAuthors = new String[1 + semiSep.length];
for (int i = 0; i < semiSep.length; i++) {
newAuthors[i] = semiSep[i].trim();
}
// The name after the single "and" becomes the last entry.
newAuthors[semiSep.length] = authors[1];
authors = newAuthors;
} else {
// Check if there is a comma in the last name. If so, we can assume that comma
// is not used to separate the names:
boolean lnfn = authors[1].indexOf(',') >= 1;
if (!lnfn) {
String[] cmSep = authors[0].split(", ");
if (cmSep.length > 1) {
// This means that the last name doesn't contain a comma, but the first
// one contains one or more. This indicates that the names leading up to
// the single "and" are comma separated:
String[] newAuthors = new String[1 + cmSep.length];
for (int i = 0; i < cmSep.length; i++) {
newAuthors[i] = cmSep[i].trim();
}
// The name after the single "and" becomes the last entry.
newAuthors[cmSep.length] = authors[1];
authors = newAuthors;
}

}
}
}

// Normalize every name individually and join the results with " and ".
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < authors.length; i++) {
String norm = NormalizeNamesFormatter.normalizeName(authors[i]);
stringBuilder.append(norm);
if (i < (authors.length - 1)) {
stringBuilder.append(" and ");
}
}
return stringBuilder.toString();
// NOTE(review): the two statements below look like the replacement body from the same diff
// hunk (parsing is delegated to AuthorList); as shown here they follow a return statement.
AuthorList authorList = AuthorList.parse(value);
return authorList.getAsLastFirstNamesWithAnd(false);
}

/**
 * Returns the description of this formatter.
 *
 * @return the English description text passed through {@code Localization.lang}
 */
@Override
public String getDescription() {
    final String description = Localization.lang("Normalizes lists of persons to the BibTeX standard.");
    return description;
}

/**
 * Normalizes a single person name to the BibTeX "Last, First" form.
 * <p>
 * The name is first matched against the patterns for a last name combined with bare or dotted
 * initials, in either order. If none matches, a name containing a comma is treated as
 * "Last, First" and its first-name part is tidied up; a name without a comma that consists only
 * of capitalized words is reordered so that the last word comes first. Any other input is
 * returned unchanged.
 * <p>
 * A single trailing comma (e.g. "Smith,") is dropped before the comma handling. Earlier the comma
 * was removed but the comma position computed beforehand was still used for {@code substring},
 * which threw a {@link StringIndexOutOfBoundsException} for such input.
 *
 * @param oldName a single name without surrounding whitespace
 * @return the normalized name, or the input (minus one trailing comma) if no rule applies
 */
private static String normalizeName(String oldName) {
    String name = oldName;

    // Last name followed by bare initials, e.g. "Smith JH".
    Matcher matcher = NormalizeNamesFormatter.LAST_F_F.matcher(name);
    if (matcher.matches()) {
        return lastNameWithInitials(matcher.group(1), matcher.group(2));
    }
    // Last name followed by dotted and/or spaced initials, e.g. "Smith J. H.".
    matcher = NormalizeNamesFormatter.LAST_FDOT_F.matcher(name);
    if (matcher.matches()) {
        return lastNameWithInitials(matcher.group(1), matcher.group(2).replaceAll("[\\. ]+", ""));
    }
    // Bare initials followed by the last name, e.g. "JH Smith".
    matcher = NormalizeNamesFormatter.F_F_LAST.matcher(name);
    if (matcher.matches()) {
        return lastNameWithInitials(matcher.group(2), matcher.group(1));
    }
    // Dotted and/or spaced initials followed by the last name, e.g. "J. H. Smith".
    matcher = NormalizeNamesFormatter.FDOT_F_LAST.matcher(name);
    if (matcher.matches()) {
        return lastNameWithInitials(matcher.group(2), matcher.group(1).replaceAll("[\\. ]+", ""));
    }

    // Drop a trailing comma BEFORE locating the separating comma, so the index used below
    // always refers to the string that is actually being cut.
    if (name.endsWith(",")) {
        name = name.substring(0, name.length() - 1);
    }

    int index = name.lastIndexOf(',');
    if (index >= 0) {
        // Name contains a comma: everything before the last comma is kept as the last name part.
        StringBuilder stringBuilder = new StringBuilder(name.substring(0, index));
        stringBuilder.append(", ");
        // Check if the remainder is a single name:
        String firstName = name.substring(index + 1).trim();
        String[] firstNameParts = firstName.split(" ");
        if (firstNameParts.length > 1) {
            // Multiple parts. Add all of them, and add a dot if they are single letter parts:
            for (int i = 0; i < firstNameParts.length; i++) {
                if (firstNameParts[i].length() == 1) {
                    stringBuilder.append(firstNameParts[i]).append('.');
                } else {
                    stringBuilder.append(firstNameParts[i]);
                }
                if (i < (firstNameParts.length - 1)) {
                    stringBuilder.append(' ');
                }
            }
        } else {
            // Only a single part. Check if it looks like a name or initials:
            Matcher nameMatcher = NormalizeNamesFormatter.SINGLE_NAME.matcher(firstNameParts[0]);
            if (nameMatcher.matches()) {
                stringBuilder.append(firstNameParts[0]);
            } else {
                // It looks like initials.
                String initials = firstNameParts[0].replaceAll("[\\.]+", "");
                fixInitials(initials, stringBuilder);
            }
        }
        return stringBuilder.toString();
    }

    // Name doesn't contain a comma
    String[] parts = name.split(" +");
    boolean allNames = true;
    for (String part : parts) {
        if (!NormalizeNamesFormatter.SINGLE_NAME.matcher(part).matches()) {
            allNames = false;
            break;
        }
    }
    if (allNames) {
        // Looks like a name written in full with first name first.
        // Change into last name first format:
        StringBuilder stringBuilder = new StringBuilder(parts[parts.length - 1]);
        if (parts.length > 1) {
            stringBuilder.append(',');
            for (int i = 0; i < (parts.length - 1); i++) {
                stringBuilder.append(' ').append(parts[i]);
                if (parts[i].length() == 1) {
                    stringBuilder.append('.');
                }
            }
        }
        return stringBuilder.toString();
    }

    return name;
}

/**
 * Builds "Last, I. I." from a last name and a string of undotted initials.
 *
 * @param lastName the last name to put first
 * @param initials the initials, one character each, without dots or spaces
 * @return the last name, a comma and space, and the initials as formatted by {@code fixInitials}
 */
private static String lastNameWithInitials(String lastName, String initials) {
    StringBuilder stringBuilder = new StringBuilder(lastName);
    stringBuilder.append(", ");
    fixInitials(initials, stringBuilder);
    return stringBuilder.toString();
}

/**
 * Appends the given initials to the builder, each followed by a dot and separated from the next
 * by a single space (e.g. "JH" is appended as "J. H.").
 *
 * @param initials      the initials, one character per initial
 * @param stringBuilder the builder receiving the formatted initials
 */
private static void fixInitials(final String initials, final StringBuilder stringBuilder) {
    int position = 0;
    for (final char initial : initials.toCharArray()) {
        // Emit the separator in front of every initial except the first one.
        if (position > 0) {
            stringBuilder.append(' ');
        }
        stringBuilder.append(initial).append('.');
        position++;
    }
}
}
49 changes: 3 additions & 46 deletions src/main/java/net/sf/jabref/logic/util/strings/StringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import com.google.common.base.CharMatcher;
import net.sf.jabref.Globals;
import net.sf.jabref.model.entry.Author;

public class StringUtil {

Expand Down Expand Up @@ -641,7 +642,7 @@ public static String expandAuthorInitials(String name) {
}
for (int j = 1; j < names.length; j++) {
if (j == 1) {
sb.append(StringUtil.expandAll(names[j]));
sb.append(Author.addDotIfAbbreviation(names[j]));
} else {
sb.append(names[j]);
}
Expand All @@ -653,7 +654,7 @@ public static String expandAuthorInitials(String name) {
} else {
String[] names = authors[i].split(" ");
if (names.length > 0) {
sb.append(StringUtil.expandAll(names[0]));
sb.append(Author.addDotIfAbbreviation(names[0]));
}
for (int j = 1; j < names.length; j++) {
sb.append(' ');
Expand All @@ -668,48 +669,4 @@ public static String expandAuthorInitials(String name) {
return sb.toString().trim();
}

/**
 * Expands run-together initials by inserting dots and spaces, e.g. "AA" becomes "A. A.",
 * "A.A." becomes "A. A." and "A-A" becomes "A.-A.". A single uppercase letter gets a dot
 * appended; an empty string is returned unchanged.
 *
 * @param s the name part to expand
 * @return the expanded text with leading and trailing whitespace removed
 */
private static String expandAll(String s) {
    // Nothing to look at; also guards the charAt(0) below.
    if (s.isEmpty()) {
        return s;
    }

    final char first = s.charAt(0);
    // A lone uppercase letter just receives its dot.
    if ((s.length() == 1) && Character.isLetter(first) && Character.isUpperCase(first)) {
        return s + ".";
    }

    final StringBuilder expanded = new StringBuilder();
    char current = first;
    char next = 0;
    int position = 1;
    while (position < s.length()) {
        next = s.charAt(position);
        position++;

        final boolean currentIsCapital = Character.isLetter(current) && Character.isUpperCase(current);
        final boolean nextIsCapital = Character.isLetter(next) && Character.isUpperCase(next);

        if (currentIsCapital && nextIsCapital) {
            // AA -> A. A.
            expanded.append(current).append(". ");
        } else if (currentIsCapital && (next == '-')) {
            // A-A -> A.-A.
            expanded.append(current).append('.');
        } else if ((current == '.') && nextIsCapital) {
            // A.A. -> A. A.
            expanded.append(". ");
        } else {
            expanded.append(current);
        }
        current = next;
    }

    // Handle the final character: an uppercase letter at the end still needs its dot.
    final boolean lastIsCapital = Character.isLetter(current) && Character.isUpperCase(current);
    final boolean lastSeenIsCapital = Character.isLetter(next) && Character.isUpperCase(next);
    expanded.append(current);
    if (lastIsCapital && lastSeenIsCapital) {
        expanded.append(". ");
    }
    return expanded.toString().trim();
}

}
Loading