Skip to content

Commit

Permalink
Fix #10498 Create Fetcher and Transformer for ScholarArchive (#10549)
Browse files Browse the repository at this point in the history
* Create Fetcher and Transformer for ScholarArchive

* Finish change requirement including code style, testing, some error , and comment.

* Finish and fix archive scholar fetcher

* add url

* fix arch

* fix test

* fix var name

* remove comments

* add changelog

* fuck this changelog

---------

Co-authored-by: youliyou <u7156540@anu.edu.au>
Co-authored-by: Siedlerchr <siedlerkiller@gmail.com>
  • Loading branch information
3 people committed Nov 6, 2023
1 parent 94e2285 commit a2aa2c6
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv

- We added a dropdown menu to let users change the reference library during AUX file import. [#10472](https://github.com/JabRef/jabref/issues/10472)
- We added a button to let users reset the cite command to the default value. [#10569](https://github.com/JabRef/jabref/issues/10569)
- We added [scholar.archive.org](https://scholar.archive.org/) as a new fetcher. [#10498](https://github.com/JabRef/jabref/issues/10498)

### Changed

Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.jabref.logic.importer.fetcher.OpenAccessDoi;
import org.jabref.logic.importer.fetcher.ResearchGate;
import org.jabref.logic.importer.fetcher.RfcFetcher;
import org.jabref.logic.importer.fetcher.ScholarArchiveFetcher;
import org.jabref.logic.importer.fetcher.ScienceDirect;
import org.jabref.logic.importer.fetcher.SemanticScholar;
import org.jabref.logic.importer.fetcher.SpringerFetcher;
Expand Down Expand Up @@ -126,6 +127,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new ResearchGate(importFormatPreferences));
set.add(new BiodiversityLibrary(importerPreferences));
set.add(new LOBIDFetcher(importerPreferences));
set.add(new ScholarArchiveFetcher());
return set;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package org.jabref.logic.importer.fetcher;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.IntStream;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.PagedSearchBasedParserFetcher;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.fetcher.transformers.ScholarArchiveQueryTransformer;
import org.jabref.logic.importer.util.JsonReader;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.AuthorList;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.EntryType;
import org.jabref.model.entry.types.StandardEntryType;

import jakarta.ws.rs.core.MediaType;
import kong.unirest.json.JSONArray;
import kong.unirest.json.JSONException;
import kong.unirest.json.JSONObject;
import org.apache.http.client.utils.URIBuilder;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ScholarArchiveFetcher implements PagedSearchBasedParserFetcher {

public static final String FETCHER_NAME = "ScholarArchive";

private static final Logger LOGGER = LoggerFactory.getLogger(ScholarArchiveFetcher.class);

private static final String API_URL = "https://scholar.archive.org/search";

/**
* Gets the query URL by luceneQuery and pageNumber.
*
* @param luceneQuery the search query
* @param pageNumber the number of the page indexed from 0
* @return URL
*/
@Override
public URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException {
URIBuilder uriBuilder = new URIBuilder(API_URL);
uriBuilder.addParameter("q", new ScholarArchiveQueryTransformer().transformLuceneQuery(luceneQuery).orElse(""));
uriBuilder.addParameter("from", String.valueOf(getPageSize() * pageNumber));
uriBuilder.addParameter("size", String.valueOf(getPageSize()));
uriBuilder.addParameter("format", "json");

LOGGER.debug("using URL for search {}", uriBuilder.build());
return uriBuilder.build().toURL();
}

@Override
public URLDownload getUrlDownload(URL url) {
URLDownload download = new URLDownload(url);
download.addHeader("Accept", MediaType.APPLICATION_JSON);
return download;
}

/**
* Gets the list of BibEntry by given Json response from scholar archive fetcher API
*
* @return Parser, list of BibEntry
*/
@Override
public Parser getParser() {
return inputStream -> {
JSONObject response = JsonReader.toJsonObject(inputStream);
List<BibEntry> entries = new ArrayList<>();
if (response.has("results")) {
JSONArray results = response.getJSONArray("results");
for (int i = 0; i < results.length(); i++) {
JSONObject jsonEntry = results.getJSONObject(i);
BibEntry entry = parseJSONtoBibtex(jsonEntry);
entries.add(entry);
}
}

return entries;
};
}

@Override
public String getName() {
return FETCHER_NAME;
}

private BibEntry parseJSONtoBibtex(JSONObject jsonEntry) throws ParseException {
try {
BibEntry entry = new BibEntry();
EntryType entryType = StandardEntryType.InCollection;
JSONObject biblio = jsonEntry.optJSONObject("biblio");

JSONArray abstracts = jsonEntry.getJSONArray("abstracts");
String foundAbstract = IntStream.range(0, abstracts.length())
.mapToObj(abstracts::getJSONObject)
.map(object -> object.optString("body"))
.findFirst().orElse("");

String url = Optional.ofNullable(jsonEntry.optJSONObject("fulltext")).map(fullText -> fullText.optString("access_url")).orElse("");

// publication type
String type = biblio.optString("release_type");
entry.setField(StandardField.TYPE, type);
if (type.toLowerCase().contains("book")) {
entryType = StandardEntryType.Book;
} else if (type.toLowerCase().contains("article")) {
entryType = StandardEntryType.Article;
}
entry.setType(entryType);

entry.setField(StandardField.TITLE, biblio.optString("title"));
entry.setField(StandardField.JOURNAL, biblio.optString("container_name"));
entry.setField(StandardField.DOI, biblio.optString("doi"));
entry.setField(StandardField.ISSUE, biblio.optString("issue"));
entry.setField(StandardField.LANGUAGE, biblio.optString("lang_code"));
entry.setField(StandardField.PUBLISHER, biblio.optString("publisher"));

entry.setField(StandardField.YEAR, String.valueOf(biblio.optInt("release_year")));
entry.setField(StandardField.VOLUME, String.valueOf(biblio.optInt("volume_int")));
entry.setField(StandardField.ABSTRACT, foundAbstract);
entry.setField(StandardField.URL, url);

String dateString = biblio.optString("date");
entry.setField(StandardField.DATE, dateString);

// Authors are in contrib_names
if (biblio.has("contrib_names")) {
JSONArray authors = biblio.getJSONArray("contrib_names");
List<String> authorList = new ArrayList<>();
for (int i = 0; i < authors.length(); i++) {
authorList.add(authors.getString(i));
}
AuthorList parsedAuthors = AuthorList.parse(String.join(" and ", authorList));
entry.setField(StandardField.AUTHOR, parsedAuthors.getAsLastFirstNamesWithAnd(false));
}

if (biblio.has("issns")) {
JSONArray issn = biblio.getJSONArray("issns");
List<String> issnList = new ArrayList<>();
for (int i = 0; i < issn.length(); i++) {
issnList.add(issn.getString(i));
}
entry.setField(StandardField.ISSN, String.join(" ", issnList));
}
return entry;
} catch (JSONException exception) {
throw new ParseException("ScholarArchive API JSON format has changed", exception);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package org.jabref.logic.importer.fetcher.transformers;

/**
 * Query transformer used by the ScholarArchive fetcher.
 * It supplies the boolean operators and the field-specific query fragments
 * that {@link AbstractQueryTransformer} assembles into the final query string.
 */
public class ScholarArchiveQueryTransformer extends AbstractQueryTransformer {

    @Override
    protected String getLogicalAndOperator() {
        return " AND ";
    }

    @Override
    protected String getLogicalOrOperator() {
        return " OR ";
    }

    @Override
    protected String getLogicalNotOperator() {
        return "NOT ";
    }

    @Override
    protected String handleAuthor(String author) {
        return createKeyValuePair("contrib_names", author);
    }

    @Override
    protected String handleTitle(String title) {
        return createKeyValuePair("title", title);
    }

    @Override
    protected String handleJournal(String journalTitle) {
        return createKeyValuePair("container_name", journalTitle);
    }

    /**
     * Turns a single year into a range expression whose lower and upper bound are both that year.
     *
     * @param year the publication year to search for
     * @return the query fragment {@code publication.startDate:[year TO year]}
     */
    @Override
    protected String handleYear(String year) {
        return startDateRange(year, year);
    }

    /**
     * Turns a year range into a range expression.
     * The range is first handed to {@code parseYearRange}; afterwards the inherited
     * {@code startYear} and {@code endYear} values are used as bounds.
     * If {@code endYear} equals {@link Integer#MAX_VALUE}, the given range string is returned unchanged.
     *
     * @param yearRange the range of years to search for (presumably in the form "startYear-endYear" - the format is defined by {@code parseYearRange})
     * @return the query fragment {@code publication.startDate:[startYear TO endYear]}, or {@code yearRange} itself
     */
    @Override
    protected String handleYearRange(String yearRange) {
        parseYearRange(yearRange);
        boolean hasExplicitEndYear = endYear != Integer.MAX_VALUE;
        if (hasExplicitEndYear) {
            return startDateRange(String.valueOf(startYear), String.valueOf(endYear));
        }
        // No usable upper bound: pass the range through as it was given
        return yearRange;
    }

    /**
     * Formats an inclusive range on the {@code publication.startDate} field.
     */
    private static String startDateRange(String from, String to) {
        return "publication.startDate:[" + from + " TO " + to + "]";
    }
}




Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public void emptyStudyConstructorFillsDatabasesCorrectly() {
new StudyCatalogItem("Medline/PubMed", false),
new StudyCatalogItem("ResearchGate", false),
new StudyCatalogItem("SAO/NASA ADS", false),
new StudyCatalogItem("ScholarArchive", false),
new StudyCatalogItem("SemanticScholar", false),
new StudyCatalogItem("Springer", true),
new StudyCatalogItem("zbMATH", false)
Expand All @@ -59,21 +60,7 @@ public void emptyStudyConstructorFillsDatabasesCorrectly() {

@Test
public void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
List<StudyDatabase> databases = List.of(
new StudyDatabase("ACM Portal", true));
Study study = new Study(
List.of("Name"),
"title",
List.of("Q1"),
List.of(),
databases
);
ManageStudyDefinitionViewModel manageStudyDefinitionViewModel = new ManageStudyDefinitionViewModel(
study,
tempDir,
importFormatPreferences,
importerPreferences,
dialogService);
ManageStudyDefinitionViewModel manageStudyDefinitionViewModel = getManageStudyDefinitionViewModel(tempDir);
assertEquals(List.of(
new StudyCatalogItem("ACM Portal", true),
new StudyCatalogItem("ArXiv", false),
Expand All @@ -92,9 +79,28 @@ public void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
new StudyCatalogItem("Medline/PubMed", false),
new StudyCatalogItem("ResearchGate", false),
new StudyCatalogItem("SAO/NASA ADS", false),
new StudyCatalogItem("ScholarArchive", false),
new StudyCatalogItem("SemanticScholar", false),
new StudyCatalogItem("Springer", false),
new StudyCatalogItem("zbMATH", false)
), manageStudyDefinitionViewModel.getCatalogs());
}

/**
 * Creates the view model under test for a study whose only configured database
 * is "ACM Portal" (enabled).
 *
 * @param tempDir the directory handed to the view model
 * @return a view model built from that study and the test's preference and dialog fields
 */
private ManageStudyDefinitionViewModel getManageStudyDefinitionViewModel(Path tempDir) {
    Study studyWithAcmPortal = new Study(
            List.of("Name"),
            "title",
            List.of("Q1"),
            List.of(),
            List.of(new StudyDatabase("ACM Portal", true)));

    return new ManageStudyDefinitionViewModel(
            studyWithAcmPortal,
            tempDir,
            importFormatPreferences,
            importerPreferences,
            dialogService);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.jabref.logic.importer.fetcher;

import java.util.List;

import org.jabref.logic.importer.FetcherException;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Tests for {@link ScholarArchiveFetcher}.
 * The search test calls {@code performSearch} on a real fetcher instance.
 */
@FetcherTest
public class ScholarArchiveFetcherTest {
    private ScholarArchiveFetcher scholarArchiveFetcher;
    private BibEntry expectedEntry;

    @BeforeEach
    public void setUp() {
        scholarArchiveFetcher = new ScholarArchiveFetcher();

        // Entry that the search for "query" is expected to contain (abstract excluded)
        expectedEntry = new BibEntry(StandardEntryType.InCollection)
                .withField(StandardField.AUTHOR, "Billerbeck, Bodo and Scholer, Falk and Williams, Hugh E. and Zobel, Justin")
                .withField(StandardField.TITLE, "Query expansion using associated queries")
                .withField(StandardField.JOURNAL, "Proceedings of the twelfth international conference on Information and knowledge management - CIKM '03")
                .withField(StandardField.PUBLISHER, "ACM Press")
                .withField(StandardField.YEAR, "2003")
                .withField(StandardField.VOLUME, "0")
                .withField(StandardField.TYPE, "paper-conference")
                .withField(StandardField.DOI, "10.1145/956863.956866")
                .withField(StandardField.URL, "https://web.archive.org/web/20170810164449/http://goanna.cs.rmit.edu.au/~jz/fulltext/cikm03.pdf");
    }

    @Test
    public void getNameReturnsCorrectName() {
        assertEquals("ScholarArchive", scholarArchiveFetcher.getName());
    }

    @Test
    public void performSearchReturnsExpectedResults() throws FetcherException {
        List<BibEntry> searchResults = scholarArchiveFetcher.performSearch("query");

        // The abstract is not part of the expected entry, so drop it before comparing
        for (BibEntry result : searchResults) {
            result.clearField(StandardField.ABSTRACT);
        }

        assertTrue(searchResults.contains(expectedEntry), "Found the following entries " + searchResults);
    }
}



0 comments on commit a2aa2c6

Please sign in to comment.