From 49957c13f8317f52acd235c252ded4fd710637aa Mon Sep 17 00:00:00 2001 From: Andrew Cholewa Date: Wed, 12 Apr 2017 10:31:38 -0500 Subject: [PATCH] Stops using `IndexSearcher::count` in Lucene. --It looks like `IndexSearcher::count` fires off an entire query, and then counts the number of documents that were hit. Currently, we are using it to determine the total number of dimension rows that satisfy a given query. However, this is unnecessary and wasteful, because the `TopDocs` object returned by the query for the actual data _also_ contains information on the total number of documents hit by the query. --- CHANGELOG.md | 6 +++++ .../dimension/impl/LuceneSearchProvider.java | 12 +++------ .../dimension/impl/SearchProviderSpec.groovy | 27 ++++++++++++++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 945583de5c..86dfa51507 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -69,6 +69,12 @@ Current ### Changed: +- [Reduced number of queries sent by `LuceneSearchProvider` by 50% in the common case](https://github.com/yahoo/fili/pull/234) + * Before, we were using `IndexSearcher::count` to get the total number of documents, which spawned an entire second query + (so two Lucene queries rather than one when requesting the first page of results). We now pull that information from + metadata generated by query we run to get the actual results. + + - [Update Lucene version 5.3.0 -> 6.5.0](https://github.com/yahoo/fili/pull/233) * [Added IndexSearcher#getQueryCache and #getQueryCachingPolicy](https://issues.apache.org/jira/browse/LUCENE-6838) * [org.apache.lucene.search.Filter is now deprecated](http://issues.apache.org/jira/browse/LUCENE-6301). diff --git a/fili-core/src/main/java/com/yahoo/bard/webservice/data/dimension/impl/LuceneSearchProvider.java b/fili-core/src/main/java/com/yahoo/bard/webservice/data/dimension/impl/LuceneSearchProvider.java index 166a460cb4..4be60f4e72 100644 --- a/fili-core/src/main/java/com/yahoo/bard/webservice/data/dimension/impl/LuceneSearchProvider.java +++ b/fili-core/src/main/java/com/yahoo/bard/webservice/data/dimension/impl/LuceneSearchProvider.java @@ -574,13 +574,15 @@ private Pagination getResultsPage(Query query, PaginationParameter try { ScoreDoc[] hits; try (TimedPhase timer = RequestLog.startTiming("QueryingLucene")) { - hits = getPageOfData( + TopDocs hitDocs = getPageOfData( luceneIndexSearcher, null, query, perPage, requestedPageNumber - ).scoreDocs; + ); + hits = hitDocs.scoreDocs; + documentCount = hitDocs.totalHits; if (hits.length == 0) { if (requestedPageNumber == 1) { return new SinglePagePagination<>(Collections.emptyList(), paginationParameters, 0); @@ -615,12 +617,6 @@ private Pagination getResultsPage(Query query, PaginationParameter .map(dimension::findDimensionRowByKeyValue) .collect(Collectors.toCollection(TreeSet::new)); } - - documentCount = luceneIndexSearcher.count(query); //throws the caught IOException - } catch (IOException e) { - LOG.error("Unable to get count of matched rows for the query " + query.toString() + - " with " + paginationParameters.toString()); - throw new RuntimeException(e); } finally { lock.readLock().unlock(); } diff --git a/fili-core/src/test/groovy/com/yahoo/bard/webservice/data/dimension/impl/SearchProviderSpec.groovy b/fili-core/src/test/groovy/com/yahoo/bard/webservice/data/dimension/impl/SearchProviderSpec.groovy index 2ab6d80cdf..01415c1efe 100644 --- a/fili-core/src/test/groovy/com/yahoo/bard/webservice/data/dimension/impl/SearchProviderSpec.groovy +++ b/fili-core/src/test/groovy/com/yahoo/bard/webservice/data/dimension/impl/SearchProviderSpec.groovy @@ -16,6 +16,7 @@ import com.yahoo.bard.webservice.data.dimension.KeyValueStore import com.yahoo.bard.webservice.data.dimension.MapStoreManager import com.yahoo.bard.webservice.data.dimension.SearchProvider import com.yahoo.bard.webservice.table.LogicalTable +import com.yahoo.bard.webservice.util.Pagination import com.yahoo.bard.webservice.web.ApiFilter import com.yahoo.bard.webservice.web.FilterOperation import com.yahoo.bard.webservice.web.util.PaginationParameters @@ -501,12 +502,36 @@ abstract class SearchProviderSpec extends Specificatio searchProvider.findFilteredDimensionRowsPaged([newDescription] as Set, paginationParameters).getPageOfData() == [dimensionRow2new] as List } + def "The pagination information contains the correct number of results, but only sends the desired page"() { + setup: "Given a filter that will filter down to three rows" + /* Expected rows, not necessarily in this order: + name: "hawk", description: "this is a raptor" + name: "eagle", description: "this is a raptor" + name: "kumquat", description: "this is not an animal" + */ + Set filters = [ + buildFilter("animal|desc-startswith[this]"), + buildFilter("animal|desc-notin[this is an owl]") + ] + and: "We get the second page, where each page has two rows (so the last page has to have only one result)" + PaginationParameters parameters = new PaginationParameters(2, 2) + + when: "We query the search provider" + Pagination resultsPage = searchProvider.findFilteredDimensionRowsPaged(filters, parameters) + + then: "We get only the last page of results (which has one value)" + resultsPage.getPageOfData().size() == 1 + + and: "The pagination metadata includes the correct number of total results" + resultsPage.numResults == 3 + } + /** * Checks that this search provider's indices have been cleared. * * @return true if the search provider's indices have been cleared properly, false otherwise */ - abstract boolean indicesHaveBeenCleared(); + abstract boolean indicesHaveBeenCleared() ApiFilter buildFilter(String filterQuery) { new ApiFilter(filterQuery, spaceIdDictionary)