diff --git a/CHANGELOG.md b/CHANGELOG.md index a60c25b248a..912b0ac72de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve - When determining the URL of an ArXiV eprint, the URL now points to the version [#8149](https://github.com/JabRef/jabref/pull/8149) - We Included all standard fields with citation key when exporting to Old OpenOffice/LibreOffice Calc Format [#8176](https://github.com/JabRef/jabref/pull/8176) - We present options to manually enter an article or return to the New Entry menu when the fetcher DOI fails to find an entry for an ID [#7870](https://github.com/JabRef/jabref/issues/7870) +- We remove whitespace and non-ASCII characters from DOIs [#8127](https://github.com/JabRef/jabref/issues/8127) ### Fixed diff --git a/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java index 617c0985814..41afea25707 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java @@ -62,6 +62,7 @@ public Optional getHelpPage() { @Override public Optional performSearchById(String identifier) throws FetcherException { Optional doi = DOI.parse(identifier); + try { if (doi.isPresent()) { Optional fetchedEntry; @@ -70,13 +71,18 @@ public Optional performSearchById(String identifier) throws FetcherExc if (getAgency(doi.get()).isPresent() && "medra".equalsIgnoreCase(getAgency(doi.get()).get())) { return new Medra().performSearchById(identifier); } - URL doiURL = new URL(doi.get().getURIAsASCIIString()); // BibTeX data URLDownload download = getUrlDownload(doiURL); download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX); - String bibtexString = download.asString(); + String bibtexString; + try { + bibtexString = download.asString(); + } catch (IOException e) { + // an IOException will be thrown if download is unable to 
download from the doiURL + throw new FetcherException(Localization.lang("No DOI data exists"), e); + } // BibTeX entry fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences, new DummyFileUpdateMonitor()); diff --git a/src/main/java/org/jabref/model/entry/identifier/DOI.java b/src/main/java/org/jabref/model/entry/identifier/DOI.java index 77750b8972c..f8285d6dc93 100644 --- a/src/main/java/org/jabref/model/entry/identifier/DOI.java +++ b/src/main/java/org/jabref/model/entry/identifier/DOI.java @@ -88,6 +88,14 @@ public class DOI implements Identifier { private static final Pattern FIND_SHORT_DOI_SHORTCUT = Pattern.compile(IN_TEXT_SHORT_DOI_SHORTCUT, Pattern.CASE_INSENSITIVE); // eg doi.org/bfrhmx (no "10/") private static final Pattern EXACT_SHORT_DOI_PATT = Pattern.compile(SHORT_DOI_EXP_PREFIX + SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE); private static final Pattern FIND_SHORT_DOI_PATT = Pattern.compile("(?:https?://[^\\s]+?)?" + FIND_SHORT_DOI_EXP, Pattern.CASE_INSENSITIVE); + + // See https://www.baeldung.com/java-regex-s-splus for explanation of \\s+ + // See https://stackoverflow.com/questions/3203190/regex-any-ascii-character for the regexp that includes ASCII characters only + // Another reference for regexp for ASCII characters: https://howtodoinjava.com/java/regex/java-clean-ascii-text-non-printable-chars/ + private static final String CHARS_TO_REMOVE = "[\\s+" // remove white space characters, i.e, \t, \n, \x0B, \f, \r . 
NOTE(review): the + sits inside the character class, so it is a literal plus sign (which is therefore removed as well), not a quantifier + "[^\\x00-\\x7F]" // strips off all non-ASCII characters + "]"; + // DOI private final String doi; // Short DOI @@ -151,8 +159,9 @@ public DOI(String doi) { */ public static Optional parse(String doi) { try { - String cleanedDOI = doi.trim(); - cleanedDOI = doi.replaceAll(" ", ""); + String cleanedDOI = doi; + cleanedDOI = cleanedDOI.replaceAll(CHARS_TO_REMOVE, ""); + return Optional.of(new DOI(cleanedDOI)); } catch (IllegalArgumentException | NullPointerException e) { return Optional.empty(); diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index a27e30f60a7..6371e0b8405 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -555,6 +555,8 @@ No\ journal\ names\ could\ be\ abbreviated.=No journal names could be abbreviate No\ journal\ names\ could\ be\ unabbreviated.=No journal names could be unabbreviated. +No\ DOI\ data\ exists=No DOI data exists + not=not not\ found=not found diff --git a/src/test/java/org/jabref/model/entry/identifier/DOITest.java b/src/test/java/org/jabref/model/entry/identifier/DOITest.java index b1f9e4e5762..b08e8470d50 100644 --- a/src/test/java/org/jabref/model/entry/identifier/DOITest.java +++ b/src/test/java/org/jabref/model/entry/identifier/DOITest.java @@ -118,6 +118,13 @@ private static Stream testData() { Arguments.of("https://doi.org/10.1109/VLHCC.2004.20", DOI.parse("https : / / doi.org / 10 .1109 /V LHCC.20 04.20").get().getURIAsASCIIString()), // parse short DOI with whitespace Arguments.of("https://doi.org/10/gf4gqc", DOI.parse("https : / / doi.org / 10 / gf4gqc").get().getURIAsASCIIString()), + // parse DOI with non-ASCII characters and whitespace + Arguments.of("https://doi.org/10/gf4gqc", DOI.parse("�https : \n ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getURIAsASCIIString()), + Arguments.of("10/gf4gqc", DOI.parse("�https : \n ␛ / / doi.org / \t 10 / \r gf4gqc�␛").get().getDOI()), + 
Arguments.of("10/gf4gqc", DOI.parse(" 10 / gf4gqc ").get().getDOI()), + Arguments.of("10.3218/3846-0", DOI.parse(" �10.3218\n/384␛6-0�").get().getDOI()), + // parse already-cleaned DOI + Arguments.of("10.3218/3846-0", DOI.parse("10.3218/3846-0").get().getDOI()), // correctlyEncodeDOIs // See http://www.doi.org/doi_handbook/2_Numbering.html#2.5.2.4