From 67ba8d59bf2cebaf18a76a9abafac9c1eb20de36 Mon Sep 17 00:00:00 2001 From: Tobias Diez Date: Fri, 13 Mar 2020 21:39:50 +0100 Subject: [PATCH] Improve arXiv fetcher No longer include the version string in the `eprint` field, as requested in https://discourse.jabref.org/t/remove-version-in-arxiv-import/1941. Also improved the arXiv identifier parser a bit. --- CHANGELOG.md | 2 + .../jabref/logic/importer/fetcher/ArXiv.java | 25 ++------ .../entry/identifier/ArXivIdentifier.java | 46 +++++++++----- .../logic/importer/fetcher/ArXivTest.java | 60 +++++++++---------- .../entry/identifier/ArXivIdentifierTest.java | 32 +++++++++- 5 files changed, 98 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfd09f6aaa0..727f5c0d5ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve ### Changed +- We improved the arXiv fetcher. It should now find entries even more reliably and no longer includes the version (e.g. `v1`) in the `eprint` field. 
[forum#1941](https://discourse.jabref.org/t/remove-version-in-arxiv-import/1941) + ### Fixed ### Removed diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java index 9dc7da1acf0..de3539d8184 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java @@ -10,8 +10,6 @@ import java.util.List; import java.util.Objects; import java.util.Optional; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import javax.xml.parsers.DocumentBuilder; @@ -59,8 +57,6 @@ public class ArXiv implements FulltextFetcher, SearchBasedFetcher, IdBasedFetche private static final Logger LOGGER = LoggerFactory.getLogger(ArXiv.class); private static final String API_URL = "https://export.arxiv.org/api/query"; - private static final String ARXIV_URL_PREFIX_FOR_ID = "(https?://arxiv.org/abs/)"; - private static final Pattern URL_PATTERN = Pattern.compile(ARXIV_URL_PREFIX_FOR_ID); private final ImportFormatPreferences importFormatPreferences; @@ -106,7 +102,7 @@ private Optional searchForEntry(String searchQuery) throws FetcherEx private Optional searchForEntryById(String id) throws FetcherException { Optional identifier = ArXivIdentifier.parse(id); - if (!identifier.isPresent()) { + if (identifier.isEmpty()) { return Optional.empty(); } @@ -263,10 +259,8 @@ public List performSearch(String query) throws FetcherException { @Override public Optional performSearchById(String identifier) throws FetcherException { - String cleanedIdentifier = identifier.replaceAll(" ", ""); - cleanedIdentifier = ArXivEntry.createIdString(cleanedIdentifier); - - return searchForEntryById(cleanedIdentifier).map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())); + return searchForEntryById(identifier) + .map((arXivEntry) -> 
arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator())); } @Override @@ -372,18 +366,7 @@ public Optional getPdfUrl() { * Returns the arXiv identifier */ public Optional getIdString() { - return urlAbstractPage.map(ArXivEntry::createIdString); - } - - public static String createIdString(String id) { - Matcher matcher = URL_PATTERN.matcher(id); - if (matcher.find()) { - // Remove leading http(s)://arxiv.org/abs/ from abstract url to get arXiv ID - return id.substring(matcher.group(1).length()); - } else { - return id; - } - + return urlAbstractPage.flatMap(ArXivIdentifier::parse).map(ArXivIdentifier::getNormalizedWithoutVersion); } public Optional getId() { diff --git a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java index 2b701a81984..87d3dcc333a 100644 --- a/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java +++ b/src/main/java/org/jabref/model/entry/identifier/ArXivIdentifier.java @@ -9,49 +9,59 @@ import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; +import org.jabref.model.strings.StringUtil; /** * Identifier for the arXiv. 
See https://arxiv.org/help/arxiv_identifier */ public class ArXivIdentifier implements Identifier { + private static final String ARXIV_PREFIX = "http(s)?://arxiv.org/(abs|pdf)/|arxiv|arXiv"; private final String identifier; private final String classification; + private final String version; ArXivIdentifier(String identifier) { - this(identifier, ""); + this(identifier, "", ""); } ArXivIdentifier(String identifier, String classification) { + this(identifier, "", classification); + } + + ArXivIdentifier(String identifier, String version, String classification) { this.identifier = identifier.trim(); + this.version = version.trim(); this.classification = classification.trim(); } public static Optional parse(String value) { - Pattern identifierPattern = Pattern.compile("(arxiv|arXiv)?\\s?:?\\s?(?\\d{4}.\\d{4,5}(v\\d+)?)\\s?(\\[(?\\S+)\\])?"); - Matcher identifierMatcher = identifierPattern.matcher(value); + String identifier = value.replaceAll(" ", ""); + Pattern identifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?\\d{4}.\\d{4,5})(v(?\\d+))?\\s?(\\[(?\\S+)\\])?"); + Matcher identifierMatcher = identifierPattern.matcher(identifier); if (identifierMatcher.matches()) { String id = identifierMatcher.group("id"); String classification = identifierMatcher.group("classification"); if (classification == null) { classification = ""; } - return Optional.of(new ArXivIdentifier(id, classification)); + String version = identifierMatcher.group("version"); + if (version == null) { + version = ""; + } + return Optional.of(new ArXivIdentifier(id, version, classification)); } - Pattern oldIdentifierPattern = Pattern.compile("(arxiv|arXiv)?\\s?:?\\s?(?(?[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})"); - Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(value); + Pattern oldIdentifierPattern = Pattern.compile("(" + ARXIV_PREFIX + ")?\\s?:?\\s?(?(?[a-z\\-]+(\\.[A-Z]{2})?)/\\d{7})(v(?\\d+))?"); + Matcher oldIdentifierMatcher = oldIdentifierPattern.matcher(identifier); if 
(oldIdentifierMatcher.matches()) { String id = oldIdentifierMatcher.group("id"); String classification = oldIdentifierMatcher.group("classification"); - return Optional.of(new ArXivIdentifier(id, classification)); - } - - Pattern urlPattern = Pattern.compile("(http://arxiv.org/abs/)(?\\S+)"); - Matcher urlMatcher = urlPattern.matcher(value); - if (urlMatcher.matches()) { - String id = urlMatcher.group("id"); - return Optional.of(new ArXivIdentifier(id)); + String version = oldIdentifierMatcher.group("version"); + if (version == null) { + version = ""; + } + return Optional.of(new ArXivIdentifier(id, version, classification)); } return Optional.empty(); @@ -99,6 +109,14 @@ public Field getDefaultField() { @Override public String getNormalized() { + if (StringUtil.isNotBlank(version)) { + return identifier + "v" + version; + } else { + return identifier; + } + } + + public String getNormalizedWithoutVersion() { return identifier; } diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java index 572d6681164..523cf9f651a 100644 --- a/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java +++ b/src/test/java/org/jabref/logic/importer/fetcher/ArXivTest.java @@ -22,14 +22,14 @@ import static org.mockito.Mockito.when; @FetcherTest -public class ArXivTest { +class ArXivTest { private ArXiv finder; private BibEntry entry; private BibEntry sliceTheoremPaper; @BeforeEach - public void setUp() { + void setUp() { ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class); when(importFormatPreferences.getKeywordSeparator()).thenReturn(','); finder = new ArXiv(importFormatPreferences); @@ -41,7 +41,7 @@ public void setUp() { sliceTheoremPaper.setField(StandardField.TITLE, "Slice theorem for Fréchet group actions and covariant symplectic field theory"); sliceTheoremPaper.setField(StandardField.DATE, "2014-05-09"); sliceTheoremPaper.setField(StandardField.ABSTRACT, 
"A general slice theorem for the action of a Fr\\'echet Lie group on a Fr\\'echet manifolds is established. The Nash-Moser theorem provides the fundamental tool to generalize the result of Palais to this infinite-dimensional setting. The presented slice theorem is illustrated by its application to gauge theories: the action of the gauge transformation group admits smooth slices at every point and thus the gauge orbit space is stratified by Fr\\'echet manifolds. Furthermore, a covariant and symplectic formulation of classical field theory is proposed and extensively discussed. At the root of this novel framework is the incorporation of field degrees of freedom F and spacetime M into the product manifold F * M. The induced bigrading of differential forms is used in order to carry over the usual symplectic theory to this new setting. The examples of the Klein-Gordon field and general Yang-Mills theory illustrate that the presented approach conveniently handles the occurring symmetries."); - sliceTheoremPaper.setField(StandardField.EPRINT, "1405.2249v1"); + sliceTheoremPaper.setField(StandardField.EPRINT, "1405.2249"); sliceTheoremPaper.setField(StandardField.FILE, ":http\\://arxiv.org/pdf/1405.2249v1:PDF"); sliceTheoremPaper.setField(StandardField.EPRINTTYPE, "arXiv"); sliceTheoremPaper.setField(StandardField.EPRINTCLASS, "math-ph"); @@ -49,17 +49,17 @@ public void setUp() { } @Test - public void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException { + void findFullTextForEmptyEntryResultsEmptyOptional() throws IOException { assertEquals(Optional.empty(), finder.findFullText(entry)); } @Test - public void findFullTextRejectsNullParameter() { + void findFullTextRejectsNullParameter() { assertThrows(NullPointerException.class, () -> finder.findFullText(null)); } @Test - public void findFullTextByDOI() throws IOException { + void findFullTextByDOI() throws IOException { entry.setField(StandardField.DOI, "10.1529/biophysj.104.047340"); 
entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); @@ -67,19 +67,19 @@ public void findFullTextByDOI() throws IOException { } @Test - public void findFullTextByEprint() throws IOException { + void findFullTextByEprint() throws IOException { entry.setField(StandardField.EPRINT, "1603.06570"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry)); } @Test - public void findFullTextByEprintWithPrefix() throws IOException { + void findFullTextByEprintWithPrefix() throws IOException { entry.setField(StandardField.EPRINT, "arXiv:1603.06570"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/1603.06570v1")), finder.findFullText(entry)); } @Test - public void findFullTextByEprintWithUnknownDOI() throws IOException { + void findFullTextByEprintWithUnknownDOI() throws IOException { entry.setField(StandardField.DOI, "10.1529/unknown"); entry.setField(StandardField.EPRINT, "1603.06570"); @@ -87,14 +87,14 @@ public void findFullTextByEprintWithUnknownDOI() throws IOException { } @Test - public void findFullTextByTitle() throws IOException { + void findFullTextByTitle() throws IOException { entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); assertEquals(Optional.of(new URL("http://arxiv.org/pdf/cond-mat/0406246v1")), finder.findFullText(entry)); } @Test - public void findFullTextByTitleAndPartOfAuthor() throws IOException { + void findFullTextByTitleAndPartOfAuthor() throws IOException { entry.setField(StandardField.TITLE, "Pause Point Spectra in DNA Constant-Force Unzipping"); entry.setField(StandardField.AUTHOR, "Weeks and Lucks"); @@ -102,19 +102,19 @@ public void findFullTextByTitleAndPartOfAuthor() throws IOException { } @Test - public void notFindFullTextByUnknownDOI() throws IOException { + void notFindFullTextByUnknownDOI() throws IOException { entry.setField(StandardField.DOI, "10.1529/unknown"); assertEquals(Optional.empty(), 
finder.findFullText(entry)); } @Test - public void notFindFullTextByUnknownId() throws IOException { + void notFindFullTextByUnknownId() throws IOException { entry.setField(StandardField.EPRINT, "1234.12345"); assertEquals(Optional.empty(), finder.findFullText(entry)); } @Test - public void findFullTextByDOINotAvailableInCatalog() throws IOException { + void findFullTextByDOINotAvailableInCatalog() throws IOException { entry.setField(StandardField.DOI, "10.1016/0370-2693(77)90015-6"); entry.setField(StandardField.TITLE, "Superspace formulation of supergravity"); @@ -122,26 +122,26 @@ public void findFullTextByDOINotAvailableInCatalog() throws IOException { } @Test - public void searchEntryByPartOfTitle() throws Exception { + void searchEntryByPartOfTitle() throws Exception { assertEquals(Collections.singletonList(sliceTheoremPaper), finder.performSearch("ti:\"slice theorem for Frechet\"")); } @Test - public void searchEntryByPartOfTitleWithAcuteAccent() throws Exception { + void searchEntryByPartOfTitleWithAcuteAccent() throws Exception { assertEquals(Collections.singletonList(sliceTheoremPaper), finder.performSearch("ti:\"slice theorem for Fréchet\"")); } @Test - public void searchEntryByOldId() throws Exception { + void searchEntryByOldId() throws Exception { BibEntry expected = new BibEntry(); expected.setType(StandardEntryType.Article); expected.setField(StandardField.AUTHOR, "H1 Collaboration"); expected.setField(StandardField.TITLE, "Multi-Electron Production at High Transverse Momenta in ep Collisions at HERA"); expected.setField(StandardField.DATE, "2003-07-07"); expected.setField(StandardField.ABSTRACT, "Multi-electron production is studied at high electron transverse momentum in positron- and electron-proton collisions using the H1 detector at HERA. The data correspond to an integrated luminosity of 115 pb-1. Di-electron and tri-electron event yields are measured. 
Cross sections are derived in a restricted phase space region dominated by photon-photon collisions. In general good agreement is found with the Standard Model predictions. However, for electron pair invariant masses above 100 GeV, three di-electron events and three tri-electron events are observed, compared to Standard Model expectations of 0.30 \\pm 0.04 and 0.23 \\pm 0.04, respectively."); - expected.setField(StandardField.EPRINT, "hep-ex/0307015v1"); + expected.setField(StandardField.EPRINT, "hep-ex/0307015"); expected.setField(StandardField.FILE, ":http\\://arxiv.org/pdf/hep-ex/0307015v1:PDF"); expected.setField(StandardField.EPRINTTYPE, "arXiv"); expected.setField(StandardField.EPRINTCLASS, "hep-ex"); @@ -153,61 +153,61 @@ public void searchEntryByOldId() throws Exception { } @Test - public void searchEntryByIdWith4DigitsAndVersion() throws Exception { + void searchEntryByIdWith4DigitsAndVersion() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("1405.2249v1")); } @Test - public void searchEntryByIdWith4Digits() throws Exception { + void searchEntryByIdWith4Digits() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("1405.2249")); } @Test - public void searchEntryByIdWith4DigitsAndPrefix() throws Exception { + void searchEntryByIdWith4DigitsAndPrefix() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("arXiv:1405.2249")); } @Test - public void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception { + void searchEntryByIdWith4DigitsAndPrefixAndNotTrimmed() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("arXiv : 1405. 
2249")); } @Test - public void searchEntryByIdWith5Digits() throws Exception { + void searchEntryByIdWith5Digits() throws Exception { assertEquals(Optional.of( "An Optimal Convergence Theorem for Mean Curvature Flow of Arbitrary Codimension in Hyperbolic Spaces"), finder.performSearchById("1503.06747").flatMap(entry -> entry.getField(StandardField.TITLE))); } @Test - public void searchWithMalformedIdThrowsException() throws Exception { + void searchWithMalformedIdThrowsException() throws Exception { assertThrows(FetcherException.class, () -> finder.performSearchById("123412345")); } @Test - public void searchIdentifierForSlicePaper() throws Exception { + void searchIdentifierForSlicePaper() throws Exception { sliceTheoremPaper.clearField(StandardField.EPRINT); - assertEquals(ArXivIdentifier.parse("1405.2249v1"), finder.findIdentifier(sliceTheoremPaper)); + assertEquals(ArXivIdentifier.parse("1405.2249"), finder.findIdentifier(sliceTheoremPaper)); } @Test - public void searchEmptyId() throws Exception { + void searchEmptyId() throws Exception { assertEquals(Optional.empty(), finder.performSearchById("")); } @Test - public void searchWithHttpUrl() throws Exception { + void searchWithHttpUrl() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("http://arxiv.org/abs/1405.2249")); } @Test - public void searchWithHttpsUrl() throws Exception { + void searchWithHttpsUrl() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("https://arxiv.org/abs/1405.2249")); } @Test - public void searchWithHttpsUrlNotTrimmed() throws Exception { + void searchWithHttpsUrlNotTrimmed() throws Exception { assertEquals(Optional.of(sliceTheoremPaper), finder.performSearchById("https : // arxiv . org / abs / 1405 . 
2249 ")); } } diff --git a/src/test/java/org/jabref/model/entry/identifier/ArXivIdentifierTest.java b/src/test/java/org/jabref/model/entry/identifier/ArXivIdentifierTest.java index 0c9250c5529..cbcfa455e2b 100644 --- a/src/test/java/org/jabref/model/entry/identifier/ArXivIdentifierTest.java +++ b/src/test/java/org/jabref/model/entry/identifier/ArXivIdentifierTest.java @@ -33,14 +33,14 @@ void parseWithArxivPrefix() throws Exception { void parseWithClassification() throws Exception { Optional parsed = ArXivIdentifier.parse("0706.0001v1 [q-bio.CB]"); - assertEquals(Optional.of(new ArXivIdentifier("0706.0001v1", "q-bio.CB")), parsed); + assertEquals(Optional.of(new ArXivIdentifier("0706.0001", "1", "q-bio.CB")), parsed); } @Test void parseWithArXivPrefixAndClassification() throws Exception { Optional parsed = ArXivIdentifier.parse("arXiv:0706.0001v1 [q-bio.CB]"); - assertEquals(Optional.of(new ArXivIdentifier("0706.0001v1", "q-bio.CB")), parsed); + assertEquals(Optional.of(new ArXivIdentifier("0706.0001", "1", "q-bio.CB")), parsed); } @Test @@ -63,4 +63,32 @@ void parseUrl() throws Exception { assertEquals(Optional.of(new ArXivIdentifier("1502.05795", "")), parsed); } + + @Test + void parseHttpsUrl() throws Exception { + Optional parsed = ArXivIdentifier.parse("https://arxiv.org/abs/1502.05795"); + + assertEquals(Optional.of(new ArXivIdentifier("1502.05795", "")), parsed); + } + + @Test + void parsePdfUrl() throws Exception { + Optional parsed = ArXivIdentifier.parse("http://arxiv.org/pdf/1502.05795"); + + assertEquals(Optional.of(new ArXivIdentifier("1502.05795", "")), parsed); + } + + @Test + void parseUrlWithVersion() throws Exception { + Optional parsed = ArXivIdentifier.parse("http://arxiv.org/abs/1502.05795v1"); + + assertEquals(Optional.of(new ArXivIdentifier("1502.05795", "1", "")), parsed); + } + + @Test + void parseOldUrlWithVersion() throws Exception { + Optional parsed = ArXivIdentifier.parse("http://arxiv.org/pdf/hep-ex/0307015v1"); + + 
assertEquals(Optional.of(new ArXivIdentifier("hep-ex/0307015", "1", "hep-ex")), parsed); + } }