Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize sort on numeric long and date fields #39770

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,9 @@ class BuildPlugin implements Plugin<Project> {
// TODO: remove this once ctx isn't added to update script params in 7.0
test.systemProperty 'es.scripting.update.ctx_in_params', 'false'

// TODO: remove when sort optimization is merged
test.systemProperty 'es.search.long_sort_optimized', 'true'

test.testLogging { TestLoggingContainer logging ->
logging.showExceptions = true
logging.showCauses = true
Expand Down
128 changes: 128 additions & 0 deletions server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,36 @@

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.queries.MinDocQuery;
import org.apache.lucene.queries.SearchAfterSortedDocQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TotalHits;
import org.elasticsearch.action.search.SearchTask;
import org.elasticsearch.common.Booleans;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.TopDocsAndMaxScore;
import org.elasticsearch.common.util.concurrent.QueueResizingEsThreadPoolExecutor;
import org.elasticsearch.index.IndexSortConfig;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.DateFieldMapper.DateFieldType;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.SearchPhase;
import org.elasticsearch.search.SearchService;
Expand All @@ -57,6 +67,8 @@
import org.elasticsearch.tasks.TaskCancelledException;
import org.elasticsearch.threadpool.ThreadPool;

import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.concurrent.ExecutorService;
import java.util.function.Consumer;
Expand All @@ -67,6 +79,7 @@
import static org.elasticsearch.search.query.QueryCollectorContext.createMinScoreCollectorContext;
import static org.elasticsearch.search.query.QueryCollectorContext.createMultiCollectorContext;
import static org.elasticsearch.search.query.TopDocsCollectorContext.createTopDocsCollectorContext;
import static org.elasticsearch.search.query.TopDocsCollectorContext.shortcutTotalHitCount;


/**
Expand All @@ -75,6 +88,8 @@
*/
public class QueryPhase implements SearchPhase {
private static final Logger LOGGER = LogManager.getLogger(QueryPhase.class);
public static final boolean SYS_PROP_LONG_SORT_OPTIMIZED =
Booleans.parseBoolean(System.getProperty("es.search.long_sort_optimized", "false"));

private final AggregationPhase aggregationPhase;
private final SuggestPhase suggestPhase;
Expand Down Expand Up @@ -133,6 +148,7 @@ public void execute(SearchContext searchContext) throws QueryPhaseExecutionExcep
static boolean execute(SearchContext searchContext,
final IndexSearcher searcher,
Consumer<Runnable> checkCancellationSetter) throws QueryPhaseExecutionException {
SortAndFormats sortAndFormatsForRewrittenNumericSort = null;
final IndexReader reader = searcher.getIndexReader();
QuerySearchResult queryResult = searchContext.queryResult();
queryResult.searchTimedOut(false);
Expand Down Expand Up @@ -204,6 +220,25 @@ static boolean execute(SearchContext searchContext,
hasFilterCollector = true;
}

// try to rewrite numeric or date sort to the optimized distanceFeatureQuery
if ((searchContext.sort() != null) && SYS_PROP_LONG_SORT_OPTIMIZED) {
Query rewrittenQuery = tryRewriteLongSort(searchContext, searcher.getIndexReader(), query, hasFilterCollector);
if (rewrittenQuery != null) {
query = rewrittenQuery;
// modify sorts: add sort on _score as 1st sort, and move the sort on the original field as the 2nd sort
SortField[] oldSortFields = searchContext.sort().sort.getSort();
DocValueFormat[] oldFormats = searchContext.sort().formats;
SortField[] newSortFields = new SortField[oldSortFields.length + 1];
DocValueFormat[] newFormats = new DocValueFormat[oldSortFields.length + 1];
newSortFields[0] = SortField.FIELD_SCORE;
newFormats[0] = DocValueFormat.RAW;
System.arraycopy(oldSortFields, 0, newSortFields, 1, oldSortFields.length);
System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length);
sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later
searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats));
}
}

boolean timeoutSet = scrollContext == null && searchContext.timeout() != null &&
searchContext.timeout().equals(SearchService.NO_TIMEOUT) == false;

Expand Down Expand Up @@ -290,6 +325,13 @@ static boolean execute(SearchContext searchContext,
for (QueryCollectorContext ctx : collectors) {
ctx.postProcess(result);
}

// if we rewrote numeric long or date sort, restore fieldDocs based on the original sort
if (sortAndFormatsForRewrittenNumericSort != null) {
searchContext.sort(sortAndFormatsForRewrittenNumericSort); // restore SortAndFormats
restoreTopFieldDocs(result, sortAndFormatsForRewrittenNumericSort);
}

ExecutorService executor = searchContext.indexShard().getThreadPool().executor(ThreadPool.Names.SEARCH);
if (executor instanceof QueueResizingEsThreadPoolExecutor) {
QueueResizingEsThreadPoolExecutor rExecutor = (QueueResizingEsThreadPoolExecutor) executor;
Expand All @@ -306,6 +348,92 @@ static boolean execute(SearchContext searchContext,
}
}

private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader reader,
Query query, boolean hasFilterCollector) throws IOException {
if (searchContext.searchAfter() != null) return null;
if (searchContext.scrollContext() != null) return null;
if (searchContext.collapse() != null) return null;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder whether we need to check that track_scores is set to false too. Maybe we don't, but in this case too I think it'd be worth to leave a comment about it.

if (searchContext.trackScores()) return null;
if (searchContext.aggregations() != null) return null;
Sort sort = searchContext.sort().sort;
SortField sortField = sort.getSort()[0];
if (SortField.Type.LONG.equals(IndexSortConfig.getSortFieldType(sortField)) == false) return null;

// check if this is a field of type Long or Date, that is indexed and has doc values
String fieldName = sortField.getField();
if (fieldName == null) return null; // happens when _score or _doc is the 1st sort field
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should probably also return null when sortField.getType is not long. Not doing it would not cause bugs to my knowledge, but Lucene supports providing custom comparators, while we are assuming the natural number ordering here.

if (searchContext.mapperService() == null) return null; // mapperService can be null in tests
final MappedFieldType fieldType = searchContext.mapperService().fullName(fieldName);
if (fieldType == null) return null; // for unmapped fields, default behaviour depending on "unmapped_type" flag
if ((fieldType.typeName().equals("long") == false) && (fieldType instanceof DateFieldType == false)) return null;
if (fieldType.indexOptions() == IndexOptions.NONE) return null; //TODO: change to pointDataDimensionCount() when implemented
if (fieldType.hasDocValues() == false) return null;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to check the missing value as well to make sure this optimization is applicable.

// check that all sorts are actual document fields or _doc
for (int i = 1; i < sort.getSort().length; i++) {
SortField sField = sort.getSort()[i];
String sFieldName = sField.getField();
if (sFieldName == null) {
if (SortField.FIELD_DOC.equals(sField) == false) return null;
} else {
if (searchContext.mapperService().fullName(sFieldName) == null) return null; // could be _script field that uses _score
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we need to disable the optimization when sorting by a script as well, since scripts may use the score. Or turn it into a whitelist rather than a blacklist and only enable this optimization when all sort fields are either actual fields or _doc.


// check that setting of missing values allows optimization
if (sortField.getMissingValue() == null) return null;
Long missingValue = (Long) sortField.getMissingValue();
boolean missingValuesAccordingToSort = (sortField.getReverse() && (missingValue == Long.MIN_VALUE)) ||
((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
if (missingValuesAccordingToSort == false) return null;

// check for multiple values
if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; //TODO: handle multiple values
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should work if the multi-valued sort mode is set to max when you sort in reverse order (that's the default) and min if you sort in natural order (that's the default too). You can check the SortedNumericSortField to find these values.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is already large, maybe we should keep making this optimization work for multi-valued fields as a follow-up?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure


// check if the optimization makes sense with the track_total_hits setting
if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
// with filter, we can't pre-calculate hitsCount, we need to explicitly calculate them => optimization does't make sense
if (hasFilterCollector) return null;
// if we can't pre-calculate hitsCount based on the query type, optimization does't make sense
if (shortcutTotalHitCount(reader, query) == -1) return null;
}

byte[] minValueBytes = PointValues.getMinPackedValue(reader, fieldName);
byte[] maxValueBytes = PointValues.getMaxPackedValue(reader, fieldName);
if ((maxValueBytes == null) || (minValueBytes == null)) return null;
long minValue = LongPoint.decodeDimension(minValueBytes, 0);
long maxValue = LongPoint.decodeDimension(maxValueBytes, 0);

Query rewrittenQuery;
if (minValue == maxValue) {
rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
} else {
long origin = (sortField.getReverse()) ? maxValue : minValue;
long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
pivotDistance = 1;
}
rewrittenQuery = LongPoint.newDistanceFeatureQuery(sortField.getField(), 1, origin, pivotDistance);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's compute pivotDistance = (maxValue - minValue) >>> 1 to avoid the overflow entirely? (>>>1 is a division by 2 on the unsigned representation)

rewrittenQuery = new BooleanQuery.Builder()
.add(query, BooleanClause.Occur.FILTER) // filter for original query
.add(rewrittenQuery, BooleanClause.Occur.SHOULD) //should for rewrittenQuery
.build();
return rewrittenQuery;
}

// Restore fieldsDocs to remove the first _score sort
// updating in place without creating new FieldDoc objects
static void restoreTopFieldDocs(QuerySearchResult result, SortAndFormats originalSortAndFormats) {
TopDocs topDocs = result.topDocs().topDocs;
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
FieldDoc fieldDoc = (FieldDoc) scoreDoc;
fieldDoc.fields = Arrays.copyOfRange(fieldDoc.fields, 1, fieldDoc.fields.length);
}
TopFieldDocs newTopDocs = new TopFieldDocs(topDocs.totalHits, topDocs.scoreDocs, originalSortAndFormats.sort.getSort());
result.topDocs(new TopDocsAndMaxScore(newTopDocs, Float.NaN), originalSortAndFormats.formats);
}

/**
* Returns true if the provided <code>query</code> returns docs in index order (internal doc ids).
* @param query The query to execute
Expand Down
Loading