-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix inspire fetcher #6258
Fix inspire fetcher #6258
Changes from 2 commits
bac8bcf
1244d7d
4d57861
b503afe
c600a58
8a366c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,49 +1,46 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.InputStreamReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.net.MalformedURLException; | ||
import java.net.URISyntaxException; | ||
import java.net.URL; | ||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.stream.Collectors; | ||
|
||
import org.jabref.logic.formatter.bibtexfields.ClearFormatter; | ||
import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter; | ||
import org.jabref.logic.help.HelpFile; | ||
import org.jabref.logic.importer.FetcherException; | ||
import org.jabref.logic.importer.ImportFormatPreferences; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.logic.importer.Parser; | ||
import org.jabref.logic.importer.SearchBasedParserFetcher; | ||
import org.jabref.logic.importer.fileformat.BibtexParser; | ||
import org.jabref.logic.util.OS; | ||
import org.jabref.logic.importer.util.MediaTypes; | ||
import org.jabref.logic.net.URLDownload; | ||
import org.jabref.model.cleanup.FieldFormatterCleanup; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.field.UnknownField; | ||
import org.jabref.model.strings.StringUtil; | ||
import org.jabref.model.util.DummyFileUpdateMonitor; | ||
|
||
import org.apache.http.client.utils.URIBuilder; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
/** | ||
* Fetches data from the INSPIRE database. | ||
* | ||
* @implNote We just use the normal search interface since it provides direct BibTeX export while the API (http://inspirehep.net/info/hep/api) currently only supports JSON and XML | ||
*/ | ||
public class INSPIREFetcher implements SearchBasedParserFetcher { | ||
|
||
private static final String INSPIRE_HOST = "https://inspirehep.net/search"; | ||
private static final String INSPIRE_HOST = "https://inspirehep.net/api/literature/"; | ||
|
||
private final ImportFormatPreferences preferences; | ||
private final ImportFormatPreferences importFormatPreferences; | ||
|
||
public INSPIREFetcher(ImportFormatPreferences preferences) { | ||
this.preferences = preferences; | ||
this.importFormatPreferences = preferences; | ||
} | ||
|
||
@Override | ||
|
@@ -59,33 +56,38 @@ public Optional<HelpFile> getHelpPage() { | |
@Override | ||
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException { | ||
URIBuilder uriBuilder = new URIBuilder(INSPIRE_HOST); | ||
uriBuilder.addParameter("p", query); // Query | ||
//uriBuilder.addParameter("jrec", "1"); // Start index (not needed at the moment) | ||
uriBuilder.addParameter("rg", "100"); // Should return up to 100 items (instead of default 25) | ||
uriBuilder.addParameter("of", "hx"); // BibTeX format | ||
uriBuilder.addParameter("q", query); // Query | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there no paging support (start + size) for the new interface? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The JSON API has support for it, but not when calling with the BibTeX header. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. On my end There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Indeed, this seems to work. I will create a follow-up PR. |
||
return uriBuilder.build().toURL(); | ||
} | ||
|
||
@Override | ||
public Parser getParser() { | ||
// Inspire returns the BibTeX result embedded in HTML | ||
// So we extract the BibTeX string from the <pre>bibtex</pre> tags and pass the content to the BibTeX parser | ||
return inputStream -> { | ||
String response = new BufferedReader(new InputStreamReader(inputStream)).lines().collect(Collectors.joining(OS.NEWLINE)); | ||
public List<BibEntry> performSearch(String query) throws FetcherException { | ||
if (StringUtil.isBlank(query)) { | ||
Siedlerchr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return Collections.emptyList(); | ||
} | ||
|
||
try { | ||
URLDownload download = new URLDownload(getURLForQuery(query)); | ||
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX); | ||
|
||
List<BibEntry> entries = new ArrayList<>(); | ||
try (InputStream stream = download.asInputStream()) { | ||
List<BibEntry> fetchedEntries = getParser().parseEntries(stream); | ||
|
||
Document doc = Jsoup.parse(response); | ||
Elements preElements = doc.getElementsByTag("pre"); | ||
// Post-cleanup | ||
fetchedEntries.forEach(this::doPostCleanup); | ||
|
||
for (Element elem : preElements) { | ||
//We have to use a new instance here, because otherwise only the first entry gets parsed | ||
BibtexParser bibtexParser = new BibtexParser(preferences, new DummyFileUpdateMonitor()); | ||
List<BibEntry> entry = bibtexParser.parseEntries(elem.text()); | ||
entries.addAll(entry); | ||
return fetchedEntries; | ||
} | ||
return entries; | ||
}; | ||
|
||
} catch (URISyntaxException e) { | ||
throw new FetcherException("Search URI is malformed", e); | ||
} catch (IOException e) { | ||
// TODO: Catch HTTP Response 401/403 errors and report that user has no rights to access resource | ||
throw new FetcherException("A network error occurred", e); | ||
} catch (ParseException e) { | ||
throw new FetcherException("An internal parser error occurred", e); | ||
} | ||
|
||
} | ||
|
||
@Override | ||
|
@@ -96,4 +98,9 @@ public void doPostCleanup(BibEntry entry) { | |
// Remove braces around content of "title" field | ||
new FieldFormatterCleanup(StandardField.TITLE, new RemoveBracesFormatter()).cleanup(entry); | ||
} | ||
|
||
@Override | ||
public Parser getParser() { | ||
return new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor()); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please include the link to the documentation of the new API as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is not yet any documentation available. I have contacted the INSPIRE HEP team.