Fix more fetchers #6790

Merged
merged 22 commits into from Aug 26, 2020
Changes from 11 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -106,6 +106,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We fixed an issue where percent sign ('%') was not formatted properly by the HTML formatter [#6753](https://github.com/JabRef/jabref/issues/6753)
- We fixed an issue with the [SAO/NASA Astrophysics Data System](https://docs.jabref.org/collect/import-using-online-bibliographic-database/ads) fetcher where `\textbackslash` appeared at the end of the abstract.
- We fixed an issue with the Science Direct fetcher where PDFs could not be downloaded. Fixes [#5860](https://github.com/JabRef/jabref/issues/5860)
- We fixed an issue with the Library of Congress importer.

### Removed

@@ -13,6 +13,8 @@
import org.jabref.model.entry.BibEntry;
import org.jabref.model.strings.StringUtil;

import org.slf4j.LoggerFactory;

/**
 * Provides a convenient interface for search-based fetchers, which follow the usual three-step procedure:
* 1. Open a URL based on the search query
@@ -39,16 +41,22 @@ default List<BibEntry> performSearch(String query) throws FetcherException {
return Collections.emptyList();
}

try (InputStream stream = getUrlDownload(getURLForQuery(query)).asInputStream()) {
URL urlForQuery;
try {
urlForQuery = getURLForQuery(query);
} catch (URISyntaxException | MalformedURLException e) {
LoggerFactory.getLogger(this.getClass()).info("Search URL {} is malformed", query);
throw new FetcherException("Search URI is malformed", e);
}
try (InputStream stream = getUrlDownload(urlForQuery).asInputStream()) {
List<BibEntry> fetchedEntries = getParser().parseEntries(stream);

// Post-cleanup
fetchedEntries.forEach(this::doPostCleanup);

return fetchedEntries;
} catch (URISyntaxException e) {
throw new FetcherException("Search URI is malformed", e);
} catch (IOException e) {
LoggerFactory.getLogger(this.getClass()).info("IOException at URL {}", urlForQuery.toString());
// TODO: Catch HTTP Response 401/403 errors and report that user has no rights to access resource
throw new FetcherException("A network error occurred", e);
} catch (ParseException e) {
@@ -65,12 +73,17 @@ default List<BibEntry> performSearch(String query) throws FetcherException {
*/
@Override
default List<BibEntry> performComplexSearch(ComplexSearchQuery complexSearchQuery) throws FetcherException {
try (InputStream stream = getUrlDownload(getComplexQueryURL(complexSearchQuery)).asInputStream()) {
URL complexQueryURL = null;
try {
complexQueryURL = getComplexQueryURL(complexSearchQuery);
} catch (URISyntaxException | MalformedURLException e) {
throw new FetcherException("Search URI is malformed", e);
}
LoggerFactory.getLogger(this.getClass()).debug("Using query URL {}", complexQueryURL.toString());
try (InputStream stream = getUrlDownload(complexQueryURL).asInputStream()) {
List<BibEntry> fetchedEntries = getParser().parseEntries(stream);
fetchedEntries.forEach(this::doPostCleanup);
return fetchedEntries;
} catch (URISyntaxException e) {
throw new FetcherException("Search URI is malformed", e);
} catch (IOException e) {
// TODO: Catch HTTP Response 401/403 errors and report that user has no rights to access resource
throw new FetcherException("A network error occurred", e);
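The reworked flow above resolves the query URL up front and logs before wrapping exceptions. For context, here is a minimal sketch of a fetcher built on this interface — illustrative only and not part of this PR; the class name, example URL, and empty parser are assumptions, while the method signatures follow the ones visible in this diff:

// Hypothetical sketch, not part of this PR: a minimal SearchBasedParserFetcher implementation.
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;

import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;

import org.apache.http.client.utils.URIBuilder;

public class ExampleFetcher implements SearchBasedParserFetcher {

    @Override
    public String getName() {
        return "Example";
    }

    @Override
    public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException {
        // Step 1: build the search URL; with this change, a malformed query is logged and
        // wrapped into a FetcherException by the default performSearch before any download starts
        return new URIBuilder("https://example.org/api/search")
                .addParameter("q", query)
                .build()
                .toURL();
    }

    @Override
    public Parser getParser() {
        // Step 2: turn the response stream into entries (assuming Parser is a functional interface;
        // a real fetcher plugs in a format-specific parser here)
        return inputStream -> Collections.emptyList();
    }

    // Step 3: doPostCleanup keeps the default implementation (no cleanup)
}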
@@ -3,6 +3,7 @@
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;

import org.jabref.logic.formatter.bibtexfields.RemoveDigitsFormatter;
import org.jabref.logic.formatter.bibtexfields.RemoveNewlinesFormatter;
@@ -14,7 +15,9 @@
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.model.cleanup.FieldFormatterCleanup;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.FieldFactory;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.field.UnknownField;

import org.apache.http.client.utils.URIBuilder;

@@ -31,10 +34,10 @@ public CollectionOfComputerScienceBibliographiesFetcher(ImportFormatPreferences
@Override
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException {
return new URIBuilder(BASIC_SEARCH_URL)
.addParameter("query", query)
.addParameter("sort", "score")
.build()
.toURL();
.addParameter("query", query)
.addParameter("sort", "score")
.build()
.toURL();
}

@Override
@@ -53,5 +56,29 @@ public void doPostCleanup(BibEntry entry) {
new FieldFormatterCleanup(StandardField.ABSTRACT, new ReplaceTabsBySpaceFormater()).cleanup(entry);
new FieldFormatterCleanup(StandardField.ABSTRACT, new RemoveRedundantSpacesFormatter()).cleanup(entry);
new FieldFormatterCleanup(StandardField.EDITOR, new RemoveDigitsFormatter()).cleanup(entry);
// the identifier field is a key-value field
// example: "urn:isbn:978-1-4503-5217-8; doi:10.1145/3129790.3129810; ISI:000505046100032; Scopus 2-s2.0-85037741580"
// thus, the key can contain multiple ":"; sometimes the value is separated by " " instead of ":"
UnknownField identifierField = new UnknownField("identifier");
entry.getField(identifierField)
.stream()
.flatMap(value -> Arrays.stream(value.split("; ")))
.forEach(identifierKeyValue -> {
// check for pattern "Scopus 2-..."
String[] identifierKeyValueSplit = identifierKeyValue.split(" ");
if (identifierKeyValueSplit.length == 1) {
// check for pattern "doi:..."
identifierKeyValueSplit = identifierKeyValue.split(":");
}
int length = identifierKeyValueSplit.length;
if (length < 2) {
return;
}
// in the case "urn:isbn:", just "isbn" is used
String key = identifierKeyValueSplit[length - 2];
String value = identifierKeyValueSplit[length - 1];
entry.setField(FieldFactory.parseField(key), value);
});
entry.clearField(identifierField);
}
}
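To make the new identifier handling easier to follow, here is a standalone illustration (not part of the PR, written to mirror the loop above) of how the example value from the comment gets split — only the last two tokens of each "; "-separated segment are kept, so "urn:isbn:..." collapses to an isbn key:

// Illustration only: mirrors the splitting logic added to doPostCleanup above.
// Can be dropped into any main method; prints one key/value pair per identifier.
String identifier = "urn:isbn:978-1-4503-5217-8; doi:10.1145/3129790.3129810; ISI:000505046100032; Scopus 2-s2.0-85037741580";
for (String keyValue : identifier.split("; ")) {
    String[] parts = keyValue.split(" ");      // matches the "Scopus 2-s2.0-..." pattern
    if (parts.length == 1) {
        parts = keyValue.split(":");           // matches "doi:..." and "urn:isbn:..."
    }
    if (parts.length < 2) {
        continue;
    }
    String key = parts[parts.length - 2];      // "urn:isbn:..." yields "isbn"
    String value = parts[parts.length - 1];
    System.out.println(key + " = " + value);   // isbn = 978-1-4503-5217-8, doi = 10.1145/3129790.3129810, ...
}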
73 changes: 39 additions & 34 deletions src/main/java/org/jabref/logic/importer/fetcher/GoogleScholar.java
@@ -43,8 +43,7 @@ public class GoogleScholar implements FulltextFetcher, SearchBasedFetcher {

private static final Pattern LINK_TO_BIB_PATTERN = Pattern.compile("(https:\\/\\/scholar.googleusercontent.com\\/scholar.bib[^\"]*)");

private static final String BASIC_SEARCH_URL = "https://scholar.google.com/scholar?";
private static final String SEARCH_IN_TITLE_URL = "https://scholar.google.com// scholar?";
private static final String BASIC_SEARCH_URL = "https://scholar.google.ch/scholar?";

private static final int NUM_RESULTS = 10;

@@ -66,10 +65,10 @@ public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherExc

try {
// title search
URIBuilder uriBuilder = new URIBuilder(SEARCH_IN_TITLE_URL);
URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
uriBuilder.addParameter("as_q", "");
// as_epq as exact phrase
uriBuilder.addParameter("as_epq", entry.getField(StandardField.TITLE).orElse(null));
uriBuilder.addParameter("as_epq", entry.getField(StandardField.TITLE).orElse(""));
// as_occt field to search in
uriBuilder.addParameter("as_occt", "title");

@@ -131,33 +130,36 @@ public Optional<HelpFile> getHelpPage() {
public List<BibEntry> performSearch(String query) throws FetcherException {
try {
obtainAndModifyCookie();
List<BibEntry> foundEntries = new ArrayList<>(10);
List<BibEntry> foundEntries = new ArrayList<>(20);

URIBuilder uriBuilder = new URIBuilder(BASIC_SEARCH_URL);
uriBuilder.addParameter("hl", "en");
uriBuilder.addParameter("btnG", "Search");
uriBuilder.addParameter("q", query);

addHitsFromQuery(foundEntries, uriBuilder.toString());

if (foundEntries.size() == 10) {
uriBuilder.addParameter("start", "10");
try {
addHitsFromQuery(foundEntries, uriBuilder.toString());

if (foundEntries.size() == 10) {
uriBuilder.addParameter("start", "10");
addHitsFromQuery(foundEntries, uriBuilder.toString());
}
} catch (IOException e) {
LOGGER.info("IOException for URL {}", uriBuilder.toString());
// if there are too many requests from the same IP address, Google answers with a 503 and redirects to a captcha challenge
// The caught IOException looks for example like this:
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
} else {
throw new FetcherException("Error while fetching from " + getName(), e);
}
}

return foundEntries;
} catch (URISyntaxException e) {
throw new FetcherException("Error while fetching from " + getName(), e);
} catch (IOException e) {
// if there are too many requests from the same IP address, Google answers with a 503 and redirects to a captcha challenge
// The caught IOException looks for example like this:
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
} else {
throw new FetcherException("Error while fetching from " + getName(), e);
}
}
}

@@ -178,26 +180,28 @@ public List<BibEntry> performComplexSearch(ComplexSearchQuery complexSearchQuery
uriBuilder.addParameter("as_yhi", year.toString());
});

addHitsFromQuery(foundEntries, uriBuilder.toString());

if (foundEntries.size() == 10) {
uriBuilder.addParameter("start", "10");
try {
addHitsFromQuery(foundEntries, uriBuilder.toString());
}

if (foundEntries.size() == 10) {
uriBuilder.addParameter("start", "10");
addHitsFromQuery(foundEntries, uriBuilder.toString());
}
} catch (IOException e) {
LOGGER.info("IOException for URL {}", uriBuilder.toString());
// if there are too many requests from the same IP address, Google answers with a 503 and redirects to a captcha challenge
// The caught IOException looks for example like this:
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
} else {
throw new FetcherException("Error while fetching from " + getName(), e);
}
}
return foundEntries;
} catch (URISyntaxException e) {
throw new FetcherException("Error while fetching from " + getName(), e);
} catch (IOException e) {
// if there are too many requests from the same IP address, Google answers with a 503 and redirects to a captcha challenge
// The caught IOException looks for example like this:
// java.io.IOException: Server returned HTTP response code: 503 for URL: https://ipv4.google.com/sorry/index?continue=https://scholar.google.com/scholar%3Fhl%3Den%26btnG%3DSearch%26q%3Dbpmn&hl=en&q=CGMSBI0NBDkYuqy9wAUiGQDxp4NLQCWbIEY1HjpH5zFJhv4ANPGdWj0
if (e.getMessage().contains("Server returned HTTP response code: 503 for URL")) {
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), e);
} else {
throw new FetcherException("Error while fetching from " + getName(), e);
}
}
}

@@ -215,6 +219,7 @@ private void addHitsFromQuery(List<BibEntry> entryList, String queryURL) throws
String content = new URLDownload(queryURL).asString();

if (needsCaptcha(content)) {
LOGGER.info("Captcha hit at {}", queryURL);
throw new FetcherException("Fetching from Google Scholar failed.",
Localization.lang("This might be caused by reaching the traffic limitation of Google Scholar (see 'Help' for details)."), null);
}
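The pagination above only requests a second result page when the first page came back with exactly ten hits. A rough sketch of the two URLs being composed (parameter names taken from this diff, the concrete query value is made up):

// Illustration only; lives inside a method that declares URISyntaxException, as performSearch does.
URIBuilder uriBuilder = new URIBuilder("https://scholar.google.ch/scholar?");
uriBuilder.addParameter("hl", "en");
uriBuilder.addParameter("btnG", "Search");
uriBuilder.addParameter("q", "bpmn");
String firstPage = uriBuilder.toString();    // hits 1-10
uriBuilder.addParameter("start", "10");
String secondPage = uriBuilder.toString();   // hits 11-20, only requested if the first page was full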
@@ -174,7 +174,7 @@ private String constructComplexQueryString(ComplexSearchQuery complexSearchQuery
complexSearchQuery.getAuthors().ifPresent(authors -> authors.forEach(author -> searchTerms.add("name:" + author)));
complexSearchQuery.getTitlePhrases().ifPresent(titlePhrases -> titlePhrases.forEach(title -> searchTerms.add("title:" + title)));
complexSearchQuery.getJournal().ifPresent(journal -> searchTerms.add("journal:" + journal));
// Since Springer API does not support year range search we ignore formYear and toYear.
// Since the Springer API does not support year range search, we ignore fromYear and toYear and use "singleYear" only
complexSearchQuery.getSingleYear().ifPresent(year -> searchTerms.add("year:" + year.toString()));
complexSearchQuery.getDefaultField().ifPresent(defaultField -> searchTerms.add(defaultField));
return String.join(" AND ", searchTerms);
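As a worked example of the string this method builds (the values are made up): each term is prefixed with its field and the terms are joined with " AND ", so one author, one title phrase, and a single year end up as one query string.

// Illustration only, not part of the PR.
List<String> searchTerms = List.of("name:Smith", "title:process mining", "year:2019");
String query = String.join(" AND ", searchTerms);   // "name:Smith AND title:process mining AND year:2019"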
@@ -6,6 +6,7 @@
import java.util.Optional;

import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.util.BuildInfo;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.identifier.DOI;
@@ -27,7 +28,7 @@ public class SpringerLink implements FulltextFetcher {
private static final Logger LOGGER = LoggerFactory.getLogger(SpringerLink.class);

private static final String API_URL = "https://api.springer.com/meta/v1/json";
private static final String API_KEY = "a98b4a55181ffcd27259bea45edad12e";
private static final String API_KEY = new BuildInfo().springerNatureAPIKey;
private static final String CONTENT_HOST = "link.springer.com";

@Override
@@ -398,7 +398,7 @@ private void putDate(Map<Field, String> fields, String elementName, DateDefiniti

case "dateIssued":
// The first 4 digits of dateIssued should be the year
fields.put(StandardField.YEAR, date.getValue().substring(0, 4));
fields.put(StandardField.YEAR, date.getValue().replaceAll("[^0-9]*", "").replaceAll("\\(\\d?\\d?\\d?\\d?.*\\)", "\1"));
break;
case "dateCreated":
// If there was no year in date issued, then take the year from date created
@@ -435,7 +435,9 @@ private void handleAuthorsInNamePart(NameDefinition name, List<String> authors,
NamePartDefinition namePart = (NamePartDefinition) value;
String type = namePart.getAtType();
if ((type == null) && (namePart.getValue() != null)) {
authors.add(namePart.getValue());
String namePartValue = namePart.getValue();
namePartValue = namePartValue.replaceAll(",$", "");
authors.add(namePartValue);
} else if ("family".equals(type) && (namePart.getValue() != null)) {
// family should come first, so if a family name appears we can set the author that came before it
// we have to check if forename and family name are not empty in case it's the first author
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/jabref/logic/net/URLDownload.java
@@ -60,7 +60,7 @@
*/
public class URLDownload {

public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0";
public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0";

private static final Logger LOGGER = LoggerFactory.getLogger(URLDownload.class);
private final URL source;
16 changes: 6 additions & 10 deletions src/test/java/org/jabref/logic/importer/fetcher/CiteSeerTest.java
@@ -8,23 +8,19 @@
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

@FetcherTest
class CiteSeerTest {

CiteSeer fetcher;

@BeforeEach
void setUp() throws Exception {
fetcher = new CiteSeer();
}
private CiteSeer fetcher = new CiteSeer();

@Test
void searchByQueryFindsEntry() throws Exception {
@Disabled("CiteseerX currently has issues with ncites query")
void searchByQueryFindsEntryRigorousDerivation() throws Exception {
BibEntry expected = new BibEntry(StandardEntryType.Misc)
.withField(StandardField.AUTHOR, "Wang Wei and Zhang Pingwen and Zhang Zhifei")
.withField(StandardField.TITLE, "Rigorous Derivation from Landau-de Gennes Theory to Eericksen-leslie Theory")
@@ -35,13 +31,13 @@ void searchByQueryFindsEntry() throws Exception {
}

@Test
void searchByQueryFindsEntry2() throws Exception {
void searchByQueryFindsEntryCopingTheoryAndResearch() throws Exception {
BibEntry expected = new BibEntry(StandardEntryType.Misc)
.withField(StandardField.AUTHOR, "Lazarus Richard S.")
.withField(StandardField.TITLE, "Coping Theory and Research: Past Present and Future")
.withField(StandardField.DOI, "10.1.1.115.9665")
.withField(StandardField.YEAR, "1993")
.withField(StandardField.JOURNAL, "PSYCHOSOMATIC MEDICINE");
.withField(StandardField.JOURNALTITLE, "PSYCHOSOMATIC MEDICINE");

List<BibEntry> fetchedEntries = fetcher.performSearch("doi:10.1.1.115.9665");
assertEquals(Collections.singletonList(expected), fetchedEntries);