From ff1fb17e0016c8d446a552afb09b6cd414003b5f Mon Sep 17 00:00:00 2001
From: Stefan Kolb
Date: Mon, 20 Feb 2017 15:15:34 +0100
Subject: [PATCH] Fixes #2574 Add logic for new Sciencedirect pages

---
 .../logic/importer/fetcher/ScienceDirect.java | 23 +++++++++++++++----
 .../importer/fetcher/ScienceDirectTest.java   | 15 +++++++++++-
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
index bb041776c9d7..672170b5cd24 100644
--- a/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
+++ b/src/main/java/org/jabref/logic/importer/fetcher/ScienceDirect.java
@@ -36,7 +36,6 @@ public class ScienceDirect implements FulltextFetcher {
     @Override
     public Optional<URL> findFullText(BibEntry entry) throws IOException {
         Objects.requireNonNull(entry);
-        Optional<URL> pdfLink = Optional.empty();
 
         // Try unique DOI first
         Optional<DOI> doi = entry.getField(FieldName.DOI).flatMap(DOI::build);
@@ -46,21 +45,35 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException {
             try {
                 String sciLink = getUrlByDoi(doi.get().getDOI());
 
+                // Identify as a desktop browser so the page is not scraped as a mobile client
                 if (!sciLink.isEmpty()) {
-                    // Retrieve PDF link
-                    Document html = Jsoup.connect(sciLink).ignoreHttpErrors(true).get();
+                    Document html = Jsoup.connect(sciLink)
+                            .userAgent("Mozilla/5.0 (Windows; U; WindowsNT 5.1; en-US; rv1.8.1.6) Gecko/20070725 Firefox/2.0.0.6")
+                            .referrer("http://www.google.com")
+                            .ignoreHttpErrors(true).get();
+
+                    // Retrieve PDF link (old page)
                     Element link = html.getElementById("pdfLink");
 
                     if (link != null) {
                         LOGGER.info("Fulltext PDF found @ ScienceDirect.");
-                        pdfLink = Optional.of(new URL(link.attr("pdfurl")));
+                        Optional<URL> pdfLink = Optional.of(new URL(link.attr("pdfurl")));
+                        return pdfLink;
+                    }
+                    // Retrieve PDF link (new page); attr() yields "" rather than null when nothing matches
+                    String url = html.getElementsByClass("pdf-download-btn-link").attr("href");
+
+                    if (!url.isEmpty()) {
+                        LOGGER.info("Fulltext PDF found @ ScienceDirect.");
+                        Optional<URL> pdfLink = Optional.of(new URL("http://www.sciencedirect.com" + url));
+                        return pdfLink;
                     }
                 }
             } catch(UnirestException e) {
                 LOGGER.warn("ScienceDirect API request failed", e);
             }
         }
-        return pdfLink;
+        return Optional.empty();
     }
 
     private String getUrlByDoi(String doi) throws UnirestException {
diff --git a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java
index 56d8e9a114f1..b22021676875 100644
--- a/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java
+++ b/src/test/java/org/jabref/logic/importer/fetcher/ScienceDirectTest.java
@@ -38,7 +38,7 @@ public void doiNotPresent() throws IOException {
     }
 
     @Test
-    public void findByDOI() throws IOException {
+    public void findByDOIOldPage() throws IOException {
         // CI server is blocked
         Assume.assumeFalse(DevEnvironment.isCIServer());
 
@@ -50,6 +50,19 @@ public void findByDOI() throws IOException {
         );
     }
 
+    @Test
+    public void findByDOINewPage() throws IOException {
+        // CI server is blocked
+        Assume.assumeFalse(DevEnvironment.isCIServer());
+
+        entry.setField("doi", "10.1016/j.aasri.2014.09.002");
+
+        Assert.assertEquals(
+                Optional.of(new URL("http://www.sciencedirect.com/science/article/pii/S2212671614001024/pdf?md5=4e2e9a369b4d5b3db5100aba599bef8b&pid=1-s2.0-S2212671614001024-main.pdf")),
+                finder.findFullText(entry)
+        );
+    }
+
     @Test
     public void notFoundByDOI() throws IOException {
         // CI server is blocked