Skip to content

Commit

Permalink
Feature/implement complex queries (#7350)
Browse files Browse the repository at this point in the history
  • Loading branch information
DominikVoigt authored Jan 26, 2021
1 parent 7117b61 commit 034cf8c
Show file tree
Hide file tree
Showing 53 changed files with 1,617 additions and 371 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,40 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;

import org.jabref.logic.importer.fetcher.ComplexSearchQuery;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.paging.Page;

import org.apache.lucene.queryparser.flexible.core.QueryNodeParseException;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser;
import org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser;

public interface PagedSearchBasedFetcher extends SearchBasedFetcher {

/**
* @param complexSearchQuery the complex query defining all fielded search parameters
* @param pageNumber requested site number indexed from 0
* @param luceneQuery the root node of the lucene query
* @param pageNumber the requested page number, indexed from 0
* @return Page with search results
*/
Page<BibEntry> performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException;
Page<BibEntry> performSearchPaged(QueryNode luceneQuery, int pageNumber) throws FetcherException;

/**
* @param complexSearchQuery query string that can be parsed into a complex search query
* @param pageNumber requested site number indexed from 0
* @param searchQuery query string that can be parsed into a lucene query
* @param pageNumber the requested page number, indexed from 0
* @return Page with search results
*/
default Page<BibEntry> performSearchPaged(String complexSearchQuery, int pageNumber) throws FetcherException {
if (complexSearchQuery.isBlank()) {
return new Page<>(complexSearchQuery, pageNumber, Collections.emptyList());
default Page<BibEntry> performSearchPaged(String searchQuery, int pageNumber) throws FetcherException {
if (searchQuery.isBlank()) {
return new Page<>(searchQuery, pageNumber, Collections.emptyList());
}
SyntaxParser parser = new StandardSyntaxParser();
final String NO_EXPLICIT_FIELD = "default";
try {
return this.performSearchPaged(parser.parse(searchQuery, NO_EXPLICIT_FIELD), pageNumber);
} catch (QueryNodeParseException e) {
throw new FetcherException("An error occurred during parsing of the query.");
}
QueryParser queryParser = new QueryParser();
Optional<ComplexSearchQuery> generatedQuery = queryParser.parseQueryStringIntoComplexQuery(complexSearchQuery);
// Otherwise just use query as a default term
return this.performSearchPaged(generatedQuery.orElse(ComplexSearchQuery.builder().defaultFieldPhrase(complexSearchQuery).build()), pageNumber);
}

/**
Expand All @@ -40,13 +46,14 @@ default int getPageSize() {
return 20;
}

@Override
default List<BibEntry> performSearch(ComplexSearchQuery complexSearchQuery) throws FetcherException {
return new ArrayList<>(performSearchPaged(complexSearchQuery, 0).getContent());
/**
* This method is used to send complex queries using fielded search.
*
* @param luceneQuery the root node of the lucene query
* @return a list of {@link BibEntry}, which are matched by the query (may be empty)
*/
default List<BibEntry> performSearch(QueryNode luceneQuery) throws FetcherException {
return new ArrayList<>(performSearchPaged(luceneQuery, 0).getContent());
}

@Override
default List<BibEntry> performSearch(String complexSearchQuery) throws FetcherException {
return new ArrayList<>(performSearchPaged(complexSearchQuery, 0).getContent());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,23 @@
import java.net.URL;
import java.util.List;

import org.jabref.logic.importer.fetcher.ComplexSearchQuery;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.paging.Page;

import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;

public interface PagedSearchBasedParserFetcher extends SearchBasedParserFetcher, PagedSearchBasedFetcher {

@Override
default Page<BibEntry> performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
default Page<BibEntry> performSearchPaged(QueryNode luceneQuery, int pageNumber) throws FetcherException {
// ADR-0014
URL urlForQuery;
try {
urlForQuery = getComplexQueryURL(complexSearchQuery, pageNumber);
urlForQuery = getURLForQuery(luceneQuery, pageNumber);
} catch (URISyntaxException | MalformedURLException e) {
throw new FetcherException("Search URI crafted from complex search query is malformed", e);
}
return new Page<>(complexSearchQuery.toString(), pageNumber, getBibEntries(urlForQuery));
return new Page<>(luceneQuery.toString(), pageNumber, getBibEntries(urlForQuery));
}

private List<BibEntry> getBibEntries(URL urlForQuery) throws FetcherException {
Expand All @@ -39,34 +40,18 @@ private List<BibEntry> getBibEntries(URL urlForQuery) throws FetcherException {

/**
* Constructs a URL based on the query, size and page number.
*
* @param query the search query
* @param luceneQuery the search query
* @param pageNumber the number of the page indexed from 0
*/
URL getURLForQuery(String query, int pageNumber) throws URISyntaxException, MalformedURLException;

/**
* Constructs a URL based on the query, size and page number.
*
* @param complexSearchQuery the search query
* @param pageNumber the number of the page indexed from 0
*/
default URL getComplexQueryURL(ComplexSearchQuery complexSearchQuery, int pageNumber) throws URISyntaxException, MalformedURLException {
return getURLForQuery(complexSearchQuery.toString(), pageNumber);
}

@Override
default List<BibEntry> performSearch(ComplexSearchQuery complexSearchQuery) throws FetcherException {
return SearchBasedParserFetcher.super.performSearch(complexSearchQuery);
}
URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException;

@Override
default URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException {
return getURLForQuery(query, 0);
default URL getURLForQuery(QueryNode luceneQuery) throws URISyntaxException, MalformedURLException, FetcherException {
return getURLForQuery(luceneQuery, 0);
}

@Override
default URL getURLForQuery(ComplexSearchQuery query) throws URISyntaxException, MalformedURLException, FetcherException {
return getComplexQueryURL(query, 0);
default List<BibEntry> performSearch(QueryNode luceneQuery) throws FetcherException {
return SearchBasedParserFetcher.super.performSearch(luceneQuery);
}
}
30 changes: 19 additions & 11 deletions src/main/java/org/jabref/logic/importer/SearchBasedFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

import java.util.Collections;
import java.util.List;
import java.util.Optional;

import org.jabref.logic.importer.fetcher.ComplexSearchQuery;
import org.jabref.model.entry.BibEntry;

import org.apache.lucene.queryparser.flexible.core.QueryNodeParseException;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser;
import org.apache.lucene.queryparser.flexible.standard.parser.StandardSyntaxParser;

import static org.jabref.logic.importer.fetcher.transformators.AbstractQueryTransformer.NO_EXPLICIT_FIELD;

/**
* Searches web resources for bibliographic information based on a free-text query.
* May return multiple search hits.
Expand All @@ -16,24 +21,27 @@ public interface SearchBasedFetcher extends WebFetcher {
/**
* This method is used to send complex queries using fielded search.
*
* @param complexSearchQuery the complex search query defining all fielded search parameters
* @param luceneQuery the root node of the lucene query
* @return a list of {@link BibEntry}, which are matched by the query (may be empty)
*/
List<BibEntry> performSearch(ComplexSearchQuery complexSearchQuery) throws FetcherException;
List<BibEntry> performSearch(QueryNode luceneQuery) throws FetcherException;

/**
* Looks for hits which are matched by the given free-text query.
*
* @param complexSearchQuery query string that can be parsed into a complex search query
* @param searchQuery query string that can be parsed into a lucene query
* @return a list of {@link BibEntry}, which are matched by the query (may be empty)
*/
default List<BibEntry> performSearch(String complexSearchQuery) throws FetcherException {
if (complexSearchQuery.isBlank()) {
default List<BibEntry> performSearch(String searchQuery) throws FetcherException {
if (searchQuery.isBlank()) {
return Collections.emptyList();
}
QueryParser queryParser = new QueryParser();
Optional<ComplexSearchQuery> generatedQuery = queryParser.parseQueryStringIntoComplexQuery(complexSearchQuery);
// Otherwise just use query as a default term
return this.performSearch(generatedQuery.orElse(ComplexSearchQuery.builder().defaultFieldPhrase(complexSearchQuery).build()));
SyntaxParser parser = new StandardSyntaxParser();

try {
return this.performSearch(parser.parse(searchQuery, NO_EXPLICIT_FIELD));
} catch (QueryNodeParseException e) {
throw new FetcherException("An error occured when parsing the query");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
import java.util.List;

import org.jabref.logic.cleanup.Formatter;
import org.jabref.logic.importer.fetcher.ComplexSearchQuery;
import org.jabref.model.entry.BibEntry;

import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;

/**
* Provides a convenient interface for search-based fetcher, which follow the usual three-step procedure:
* <ol>
Expand All @@ -26,14 +27,14 @@ public interface SearchBasedParserFetcher extends SearchBasedFetcher {
* This method is necessary as the performSearch method does not support certain URL parameters that are used for
* fielded search, such as a title, author, or year parameter.
*
* @param complexSearchQuery the search query defining all fielded search parameters
* @param luceneQuery the root node of the lucene query
*/
@Override
default List<BibEntry> performSearch(ComplexSearchQuery complexSearchQuery) throws FetcherException {
default List<BibEntry> performSearch(QueryNode luceneQuery) throws FetcherException {
// ADR-0014
URL urlForQuery;
try {
urlForQuery = getURLForQuery(complexSearchQuery);
urlForQuery = getURLForQuery(luceneQuery);
} catch (URISyntaxException | MalformedURLException | FetcherException e) {
throw new FetcherException("Search URI crafted from complex search query is malformed", e);
}
Expand All @@ -52,22 +53,17 @@ private List<BibEntry> getBibEntries(URL urlForQuery) throws FetcherException {
}
}

default URL getURLForQuery(ComplexSearchQuery query) throws URISyntaxException, MalformedURLException, FetcherException {
// Default implementation behaves as getURLForQuery treating complex query as plain string query
return this.getURLForQuery(query.toString());
}

/**
* Returns the parser used to convert the response to a list of {@link BibEntry}.
*/
Parser getParser();

/**
* Constructs a URL based on the query.
* Constructs a URL based on the lucene query.
*
* @param query the search query
* @param luceneQuery the root node of the lucene query
*/
URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException;
URL getURLForQuery(QueryNode luceneQuery) throws URISyntaxException, MalformedURLException, FetcherException;

/**
* Performs a cleanup of the fetched entry.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.logic.importer.fetcher.transformators.DefaultQueryTransformer;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.apache.http.client.utils.URIBuilder;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;

public class ACMPortalFetcher implements SearchBasedParserFetcher {

Expand All @@ -36,15 +38,16 @@ public Optional<HelpFile> getHelpPage() {
return Optional.of(HelpFile.FETCHER_ACM);
}

private static String createQueryString(String query) {
private static String createQueryString(QueryNode query) throws FetcherException {
String queryString = new DefaultQueryTransformer().transformLuceneQuery(query).orElse("");
// Query syntax to search for an entry that matches "one" and "two" in any field is: (+one +two)
return "(%252B" + query.trim().replaceAll("\\s+", "%20%252B") + ")";
return "(%252B" + queryString.trim().replaceAll("\\s+", "%20%252B") + ")";
}

@Override
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException {
public URL getURLForQuery(QueryNode luceneQuery) throws URISyntaxException, MalformedURLException, FetcherException {
URIBuilder uriBuilder = new URIBuilder(SEARCH_URL);
uriBuilder.addParameter("query", createQueryString(query)); // Search all fields
uriBuilder.addParameter("query", createQueryString(luceneQuery)); // Search all fields
uriBuilder.addParameter("within", "owners.owner=GUIDE"); // Search within the ACM Guide to Computing Literature (encompasses the ACM Full-Text Collection)
uriBuilder.addParameter("expformat", "bibtex"); // BibTeX format
return uriBuilder.build().toURL();
Expand Down
35 changes: 19 additions & 16 deletions src/main/java/org/jabref/logic/importer/fetcher/ArXiv.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.jabref.logic.importer.IdFetcher;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.PagedSearchBasedFetcher;
import org.jabref.logic.importer.fetcher.transformators.ArXivQueryTransformer;
import org.jabref.logic.util.io.XMLUtil;
import org.jabref.logic.util.strings.StringSimilarity;
import org.jabref.model.entry.BibEntry;
Expand All @@ -36,6 +37,7 @@
import org.jabref.model.util.OptionalUtil;

import org.apache.http.client.utils.URIBuilder;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
Expand Down Expand Up @@ -252,25 +254,26 @@ public Optional<HelpFile> getHelpPage() {
/**
* Constructs a complex query string using the field prefixes specified at https://arxiv.org/help/api/user-manual
*
* @param complexSearchQuery the search query defining all fielded search parameters
* @param luceneQuery the root node of the lucene query
* @return A list of entries matching the complex query
*/
@Override
public Page<BibEntry> performSearchPaged(ComplexSearchQuery complexSearchQuery, int pageNumber) throws FetcherException {
List<String> searchTerms = new ArrayList<>();
complexSearchQuery.getAuthors().forEach(author -> searchTerms.add("au:" + author));
complexSearchQuery.getTitlePhrases().forEach(title -> searchTerms.add("ti:" + title));
complexSearchQuery.getAbstractPhrases().forEach(abstr -> searchTerms.add("abs:" + abstr));
complexSearchQuery.getJournal().ifPresent(journal -> searchTerms.add("jr:" + journal));
// Since ArXiv API does not support year search, we ignore the year related terms
complexSearchQuery.getToYear().ifPresent(year -> searchTerms.add(year.toString()));
searchTerms.addAll(complexSearchQuery.getDefaultFieldPhrases());
String complexQueryString = String.join(" AND ", searchTerms);

List<BibEntry> searchResult = searchForEntries(complexQueryString, pageNumber).stream()
.map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()))
.collect(Collectors.toList());
return new Page<>(complexQueryString, pageNumber, searchResult);
public Page<BibEntry> performSearchPaged(QueryNode luceneQuery, int pageNumber) throws FetcherException {
ArXivQueryTransformer transformer = new ArXivQueryTransformer();
String transformedQuery = transformer.transformLuceneQuery(luceneQuery).orElse("");
List<BibEntry> searchResult = searchForEntries(transformedQuery, pageNumber).stream()
.map((arXivEntry) -> arXivEntry.toBibEntry(importFormatPreferences.getKeywordSeparator()))
.collect(Collectors.toList());
return new Page<>(transformedQuery, pageNumber, filterYears(searchResult, transformer));
}

/**
 * Removes every entry whose year lies outside the year bounds collected by the transformer.
 * <p>
 * Entries without a date field are always dropped. When a start or end year is set, entries whose
 * year cannot be read from the date field are dropped as well, instead of aborting the whole search
 * with an unchecked exception.
 *
 * @param searchResult the entries returned by the search
 * @param transformer  the transformer that processed the query; supplies the optional start and end year
 * @return a new list containing only the entries that satisfy the year bounds
 */
private List<BibEntry> filterYears(List<BibEntry> searchResult, ArXivQueryTransformer transformer) {
    return searchResult.stream()
                       .filter(entry -> isWithinYearRange(entry, transformer))
                       .collect(Collectors.toList());
}

/**
 * Decides whether a single entry satisfies the start/end year bounds of the transformer.
 */
private static boolean isWithinYearRange(BibEntry entry, ArXivQueryTransformer transformer) {
    Optional<String> date = entry.getField(StandardField.DATE);
    if (date.isEmpty()) {
        return false;
    }
    // Without any bound there is nothing to compare, so the date is not parsed at all
    if (transformer.getStartYear().isEmpty() && transformer.getEndYear().isEmpty()) {
        return true;
    }
    Optional<Integer> parsedYear = parseLeadingYear(date.get());
    if (parsedYear.isEmpty()) {
        // The year cannot be determined, so the entry cannot be confirmed to be in range
        return false;
    }
    int year = parsedYear.get();
    boolean notAfterEnd = transformer.getEndYear().isEmpty() || year <= transformer.getEndYear().get();
    boolean notBeforeStart = transformer.getStartYear().isEmpty() || year >= transformer.getStartYear().get();
    return notAfterEnd && notBeforeStart;
}

/**
 * Reads the first four characters of the date value as the year.
 *
 * @return the parsed year, or an empty Optional if the value is shorter than four characters or
 *         its first four characters are not a number
 */
private static Optional<Integer> parseLeadingYear(String date) {
    if (date.length() < 4) {
        return Optional.empty();
    }
    try {
        return Optional.of(Integer.parseInt(date.substring(0, 4)));
    } catch (NumberFormatException ignored) {
        // A malformed date must not fail the whole search; the caller treats it as "year unknown"
        return Optional.empty();
    }
}

@Override
Expand Down
Loading

0 comments on commit 034cf8c

Please sign in to comment.