Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #10498 Create Fetcher and Transformer for ScholarArchive #10549

Merged
merged 13 commits into from
Nov 6, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv

- We added a dropdown menu to let users change the reference library during AUX file import. [#10472](https://github.com/JabRef/jabref/issues/10472)
- We added a button to let users reset the cite command to the default value. [#10569](https://github.com/JabRef/jabref/issues/10569)
- We added [scholar.archive.org](https://scholar.archive.org/) as a new fetcher. [#10498](https://github.com/JabRef/jabref/issues/10498)

### Changed

Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.jabref.logic.importer.fetcher.OpenAccessDoi;
import org.jabref.logic.importer.fetcher.ResearchGate;
import org.jabref.logic.importer.fetcher.RfcFetcher;
import org.jabref.logic.importer.fetcher.ScholarArchiveFetcher;
import org.jabref.logic.importer.fetcher.ScienceDirect;
import org.jabref.logic.importer.fetcher.SemanticScholar;
import org.jabref.logic.importer.fetcher.SpringerFetcher;
Expand Down Expand Up @@ -126,6 +127,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new ResearchGate(importFormatPreferences));
set.add(new BiodiversityLibrary(importerPreferences));
set.add(new LOBIDFetcher(importerPreferences));
set.add(new ScholarArchiveFetcher());
return set;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package org.jabref.logic.importer.fetcher;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.IntStream;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.PagedSearchBasedParserFetcher;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.fetcher.transformers.ScholarArchiveQueryTransformer;
import org.jabref.logic.importer.util.JsonReader;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.AuthorList;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

import jakarta.ws.rs.core.MediaType;
import kong.unirest.json.JSONArray;
import kong.unirest.json.JSONException;
import kong.unirest.json.JSONObject;
import org.apache.http.client.utils.URIBuilder;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Search-based fetcher for <a href="https://scholar.archive.org/">scholar.archive.org</a>.
 * <p>
 * Queries are sent to the search endpoint with {@code format=json}. Every element of the
 * {@code results} array of the response is converted into one {@link BibEntry}.
 */
public class ScholarArchiveFetcher implements PagedSearchBasedParserFetcher {

    public static final String FETCHER_NAME = "ScholarArchive";

    private static final Logger LOGGER = LoggerFactory.getLogger(ScholarArchiveFetcher.class);

    private static final String API_URL = "https://scholar.archive.org/search";

    /**
     * Builds the search URL for the given query and result page.
     *
     * @param luceneQuery the search query; it is rewritten by {@link ScholarArchiveQueryTransformer},
     *                    and an empty query string is sent if the transformer yields nothing
     * @param pageNumber  the number of the page, indexed from 0
     * @return the URL to request for that page
     * @throws URISyntaxException    if the base URL or the assembled URI is not a valid URI
     * @throws MalformedURLException if the assembled URI cannot be converted to a URL
     * @throws FetcherException      declared by the overridden method
     */
    @Override
    public URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException {
        int pageSize = getPageSize();
        String transformedQuery = new ScholarArchiveQueryTransformer().transformLuceneQuery(luceneQuery).orElse("");

        URIBuilder uriBuilder = new URIBuilder(API_URL);
        uriBuilder.addParameter("q", transformedQuery);
        // Offset of the first result on the requested page.
        uriBuilder.addParameter("from", String.valueOf(pageSize * pageNumber));
        uriBuilder.addParameter("size", String.valueOf(pageSize));
        uriBuilder.addParameter("format", "json");

        // Build once and reuse the result for both logging and the return value.
        URL url = uriBuilder.build().toURL();
        LOGGER.debug("using URL for search {}", url);
        return url;
    }

    /**
     * Creates the download for the given URL and asks the server for a JSON response.
     *
     * @param url the URL to download from
     * @return a download carrying an {@code Accept: application/json} header
     */
    @Override
    public URLDownload getUrlDownload(URL url) {
        URLDownload download = new URLDownload(url);
        download.addHeader("Accept", MediaType.APPLICATION_JSON);
        return download;
    }

    /**
     * Returns a parser that converts the JSON response of the search API into a list of entries.
     * <p>
     * A response without a {@code results} array yields an empty list.
     *
     * @return parser producing one {@link BibEntry} per element of {@code results}
     */
    @Override
    public Parser getParser() {
        return inputStream -> {
            JSONObject response = JsonReader.toJsonObject(inputStream);
            List<BibEntry> entries = new ArrayList<>();

            // optJSONArray covers both "key absent" and "value is not an array".
            JSONArray results = response.optJSONArray("results");
            if (results == null) {
                return entries;
            }

            try {
                for (int i = 0; i < results.length(); i++) {
                    entries.add(parseJSONtoBibtex(results.getJSONObject(i)));
                }
            } catch (JSONException exception) {
                // A non-object element in "results" is reported like any other format problem.
                throw new ParseException("ScholarArchive API JSON format has changed", exception);
            }
            return entries;
        };
    }

    @Override
    public String getName() {
        return FETCHER_NAME;
    }

    /**
     * Converts a single search result into a {@link BibEntry}.
     *
     * @param jsonEntry one element of the {@code results} array
     * @return the converted entry
     * @throws ParseException if the result has no {@code biblio} object or a value has an unexpected JSON type
     */
    private BibEntry parseJSONtoBibtex(JSONObject jsonEntry) throws ParseException {
        try {
            // getJSONObject (not optJSONObject): a missing "biblio" must surface as a ParseException
            // through the catch below instead of a NullPointerException further down.
            JSONObject biblio = jsonEntry.getJSONObject("biblio");

            BibEntry entry = new BibEntry();

            // publication type
            String type = biblio.optString("release_type");
            entry.setField(StandardField.TYPE, type);
            entry.setType(toEntryType(type));

            entry.setField(StandardField.TITLE, biblio.optString("title"));
            entry.setField(StandardField.JOURNAL, biblio.optString("container_name"));
            entry.setField(StandardField.DOI, biblio.optString("doi"));
            entry.setField(StandardField.ISSUE, biblio.optString("issue"));
            entry.setField(StandardField.LANGUAGE, biblio.optString("lang_code"));
            entry.setField(StandardField.PUBLISHER, biblio.optString("publisher"));

            // optInt yields 0 when the value is absent; do not store "0" as a publication year.
            int releaseYear = biblio.optInt("release_year");
            if (releaseYear > 0) {
                entry.setField(StandardField.YEAR, String.valueOf(releaseYear));
            }

            // NOTE(review): a missing "volume_int" is still stored as "0". ScholarArchiveFetcherTest
            // expects exactly that value, so changing it needs a matching test update.
            entry.setField(StandardField.VOLUME, String.valueOf(biblio.optInt("volume_int")));

            entry.setField(StandardField.ABSTRACT, firstAbstractBody(jsonEntry));

            String url = Optional.ofNullable(jsonEntry.optJSONObject("fulltext"))
                                 .map(fullText -> fullText.optString("access_url"))
                                 .orElse("");
            entry.setField(StandardField.URL, url);

            entry.setField(StandardField.DATE, biblio.optString("date"));

            // Authors are in contrib_names
            if (biblio.has("contrib_names")) {
                List<String> authorNames = toStringList(biblio.getJSONArray("contrib_names"));
                AuthorList parsedAuthors = AuthorList.parse(String.join(" and ", authorNames));
                entry.setField(StandardField.AUTHOR, parsedAuthors.getAsLastFirstNamesWithAnd(false));
            }

            if (biblio.has("issns")) {
                List<String> issns = toStringList(biblio.getJSONArray("issns"));
                entry.setField(StandardField.ISSN, String.join(" ", issns));
            }
            return entry;
        } catch (JSONException exception) {
            throw new ParseException("ScholarArchive API JSON format has changed", exception);
        }
    }

    /**
     * Maps the API's release type to an entry type: anything containing "book" becomes a book,
     * anything containing "article" an article, everything else stays an in-collection entry.
     */
    private static EntryType toEntryType(String releaseType) {
        // Locale.ROOT keeps the comparison independent of the user's locale (e.g. the Turkish dotless i).
        String normalizedType = releaseType.toLowerCase(Locale.ROOT);
        if (normalizedType.contains("book")) {
            return StandardEntryType.Book;
        }
        if (normalizedType.contains("article")) {
            return StandardEntryType.Article;
        }
        return StandardEntryType.InCollection;
    }

    /**
     * Returns the body of the first element of {@code abstracts}, or an empty string if the
     * result has no abstracts.
     */
    private static String firstAbstractBody(JSONObject jsonEntry) {
        JSONArray abstracts = jsonEntry.optJSONArray("abstracts");
        if (abstracts == null) {
            return "";
        }
        return IntStream.range(0, abstracts.length())
                        .mapToObj(abstracts::getJSONObject)
                        .map(abstractObject -> abstractObject.optString("body"))
                        .findFirst()
                        .orElse("");
    }

    /**
     * Copies the string elements of a JSON array into a list, preserving their order.
     */
    private static List<String> toStringList(JSONArray array) {
        List<String> values = new ArrayList<>(array.length());
        for (int i = 0; i < array.length(); i++) {
            values.add(array.getString(i));
        }
        return values;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package org.jabref.logic.importer.fetcher.transformers;

/**
* This class extends the AbstractQueryTransformer to provide specific implementations
* for transforming standard queries into ones suitable for the Scholar Archive's unique format.
*/
public class ScholarArchiveQueryTransformer extends AbstractQueryTransformer {

    @Override
    protected String getLogicalAndOperator() {
        return " AND ";
    }

    @Override
    protected String getLogicalOrOperator() {
        return " OR ";
    }

    @Override
    protected String getLogicalNotOperator() {
        return "NOT ";
    }

    /**
     * Maps an author term to the {@code contrib_names} field.
     */
    @Override
    protected String handleAuthor(String author) {
        return createKeyValuePair("contrib_names", author);
    }

    /**
     * Maps a title term to the {@code title} field.
     */
    @Override
    protected String handleTitle(String title) {
        return createKeyValuePair("title", title);
    }

    /**
     * Maps a journal term to the {@code container_name} field.
     */
    @Override
    protected String handleJournal(String journalTitle) {
        return createKeyValuePair("container_name", journalTitle);
    }

    /**
     * Expresses an exact year match as a range whose lower and upper bound are both {@code year}.
     *
     * @param year the publication year to search for
     * @return the query segment {@code publication.startDate:[year TO year]}
     */
    @Override
    protected String handleYear(String year) {
        return yearRangeExpression(year, year);
    }

    /**
     * Transforms a year range into a single range expression.
     * <p>
     * The range is parsed by {@code parseYearRange}. If the end year is {@link Integer#MAX_VALUE}
     * afterwards, no range expression is built and {@code yearRange} is returned unchanged.
     *
     * @param yearRange the range of years to search for, usually in the format "startYear-endYear"
     * @return the query segment {@code publication.startDate:[startYear TO endYear]}, or the
     *         unmodified input if no usable end year is available
     */
    @Override
    protected String handleYearRange(String yearRange) {
        parseYearRange(yearRange);
        if (endYear == Integer.MAX_VALUE) {
            // No usable end year: hand the range through untouched rather than guessing an upper bound.
            return yearRange;
        }
        return yearRangeExpression(String.valueOf(startYear), String.valueOf(endYear));
    }

    /**
     * Builds the inclusive range expression shared by the year and year-range handlers.
     * <p>
     * NOTE(review): confirm that scholar.archive.org understands the field name
     * {@code publication.startDate}; nothing visible here establishes that.
     */
    private static String yearRangeExpression(String from, String to) {
        return "publication.startDate:[" + from + " TO " + to + "]";
    }
}




Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public void emptyStudyConstructorFillsDatabasesCorrectly() {
new StudyCatalogItem("Medline/PubMed", false),
new StudyCatalogItem("ResearchGate", false),
new StudyCatalogItem("SAO/NASA ADS", false),
new StudyCatalogItem("ScholarArchive", false),
new StudyCatalogItem("SemanticScholar", false),
new StudyCatalogItem("Springer", true),
new StudyCatalogItem("zbMATH", false)
Expand All @@ -59,21 +60,7 @@ public void emptyStudyConstructorFillsDatabasesCorrectly() {

@Test
public void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
List<StudyDatabase> databases = List.of(
new StudyDatabase("ACM Portal", true));
Study study = new Study(
List.of("Name"),
"title",
List.of("Q1"),
List.of(),
databases
);
ManageStudyDefinitionViewModel manageStudyDefinitionViewModel = new ManageStudyDefinitionViewModel(
study,
tempDir,
importFormatPreferences,
importerPreferences,
dialogService);
ManageStudyDefinitionViewModel manageStudyDefinitionViewModel = getManageStudyDefinitionViewModel(tempDir);
assertEquals(List.of(
new StudyCatalogItem("ACM Portal", true),
new StudyCatalogItem("ArXiv", false),
Expand All @@ -92,9 +79,28 @@ public void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
new StudyCatalogItem("Medline/PubMed", false),
new StudyCatalogItem("ResearchGate", false),
new StudyCatalogItem("SAO/NASA ADS", false),
new StudyCatalogItem("ScholarArchive", false),
new StudyCatalogItem("SemanticScholar", false),
new StudyCatalogItem("Springer", false),
new StudyCatalogItem("zbMATH", false)
), manageStudyDefinitionViewModel.getCatalogs());
}

/**
 * Creates the view model under test for a study that lists a single database,
 * "ACM Portal", constructed with the flag {@code true}.
 *
 * @param tempDir directory handed to the view model
 * @return a view model backed by that study and the test's preference and dialog fixtures
 */
private ManageStudyDefinitionViewModel getManageStudyDefinitionViewModel(Path tempDir) {
    Study studyWithAcmPortal = new Study(
            List.of("Name"),
            "title",
            List.of("Q1"),
            List.of(),
            List.of(new StudyDatabase("ACM Portal", true)));

    return new ManageStudyDefinitionViewModel(
            studyWithAcmPortal,
            tempDir,
            importFormatPreferences,
            importerPreferences,
            dialogService);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.jabref.logic.importer.fetcher;

import java.util.List;

import org.jabref.logic.importer.FetcherException;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

@FetcherTest
public class ScholarArchiveFetcherTest {
private ScholarArchiveFetcher fetcher;
private BibEntry bibEntry;

@BeforeEach
public void setUp() {
fetcher = new ScholarArchiveFetcher();
bibEntry = new BibEntry(StandardEntryType.InCollection)
.withField(StandardField.TITLE, "Query expansion using associated queries")
.withField(StandardField.AUTHOR, "Billerbeck, Bodo and Scholer, Falk and Williams, Hugh E. and Zobel, Justin")
.withField(StandardField.VOLUME, "0")
.withField(StandardField.DOI, "10.1145/956863.956866")
.withField(StandardField.JOURNAL, "Proceedings of the twelfth international conference on Information and knowledge management - CIKM '03")
.withField(StandardField.PUBLISHER, "ACM Press")
.withField(StandardField.TYPE, "paper-conference")
.withField(StandardField.YEAR, "2003")
.withField(StandardField.URL, "https://web.archive.org/web/20170810164449/http://goanna.cs.rmit.edu.au/~jz/fulltext/cikm03.pdf");
}

@Test
public void getNameReturnsCorrectName() {
assertEquals("ScholarArchive", fetcher.getName());
}

@Test
public void performSearchReturnsExpectedResults() throws FetcherException {
Siedlerchr marked this conversation as resolved.
Show resolved Hide resolved
List<BibEntry> fetchedEntries = fetcher.performSearch("query");
fetchedEntries.forEach(entry -> entry.clearField(StandardField.ABSTRACT));
assertTrue(fetchedEntries.contains(bibEntry), "Found the following entries " + fetchedEntries);
}
}



Loading