Skip to content

Commit

Permalink
Adding numeric optimization support for all numeric types
Browse files Browse the repository at this point in the history
Signed-off-by: gashutos <gashutos@amazon.com>
  • Loading branch information
gashutos committed Feb 21, 2023
1 parent 4316f96 commit dfde04a
Show file tree
Hide file tree
Showing 5 changed files with 380 additions and 122 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix timeout error when adding a document to an index with extension running ([#6275](https://github.com/opensearch-project/OpenSearch/pull/6275))
- Handle translog upload during primary relocation for remote-backed indexes ([#5804](https://github.com/opensearch-project/OpenSearch/pull/5804))
- Batch translog sync/upload per x ms for remote-backed indexes ([#5854](https://github.com/opensearch-project/OpenSearch/pull/5854))
- Enable sort optimization for all NumericTypes ([#6321](https://github.com/opensearch-project/OpenSearch/pull/6321))

### Dependencies
- Update nebula-publishing-plugin to 19.2.0 ([#5704](https://github.com/opensearch-project/OpenSearch/pull/5704))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,22 @@
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.ObjectObjectHashMap;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.TotalHits.Relation;
import org.apache.lucene.search.comparators.NumericComparator;
import org.apache.lucene.search.grouping.CollapseTopFieldDocs;
import org.opensearch.common.breaker.CircuitBreaker;
import org.opensearch.common.collect.HppcMaps;
Expand All @@ -72,6 +77,7 @@
import org.opensearch.search.suggest.Suggest.Suggestion;
import org.opensearch.search.suggest.completion.CompletionSuggestion;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -237,11 +243,13 @@ static TopDocs mergeTopDocs(Collection<TopDocs> results, int topN, int from) {
} else if (topDocs instanceof CollapseTopFieldDocs) {
CollapseTopFieldDocs firstTopDocs = (CollapseTopFieldDocs) topDocs;
final Sort sort = new Sort(firstTopDocs.fields);
applySortFieldWidening(sort);
final CollapseTopFieldDocs[] shardTopDocs = results.toArray(new CollapseTopFieldDocs[numShards]);
mergedTopDocs = CollapseTopFieldDocs.merge(sort, from, topN, shardTopDocs, false);
} else if (topDocs instanceof TopFieldDocs) {
TopFieldDocs firstTopDocs = (TopFieldDocs) topDocs;
final Sort sort = new Sort(firstTopDocs.fields);
applySortFieldWidening(sort);
final TopFieldDocs[] shardTopDocs = results.toArray(new TopFieldDocs[numShards]);
mergedTopDocs = TopDocs.merge(sort, from, topN, shardTopDocs);
} else {
Expand Down Expand Up @@ -600,6 +608,94 @@ private static void validateMergeSortValueFormats(Collection<? extends SearchPha
}
}

/**
 * Widens every {@link SortedNumericSortField} of the given sort, in place, so that shard results
 * can be merged with the widest numeric comparison.
 * <p>
 * Different indices might map the same field to different numeric types. Widening at merge time
 * spares the user a re-index when the mapping of a field changes for newly indexed data.
 * INT and LONG fields are replaced by a field that compares as LONG; FLOAT and DOUBLE fields are
 * replaced by a field that compares as DOUBLE. Any other sort field is left untouched.
 * <p>
 * Earlier this widening was taken care of in IndexNumericFieldData, but it was removed from there
 * to support sort optimization and is now applied here during merging.
 * More details here https://github.com/opensearch-project/OpenSearch/issues/6326
 *
 * @param sort the sort whose fields are rewritten in the array returned by {@code sort.getSort()}
 */
private static void applySortFieldWidening(Sort sort) {
    final SortField[] sortFields = sort.getSort();
    for (int idx = 0; idx < sortFields.length; idx++) {
        if (!(sortFields[idx] instanceof SortedNumericSortField)) {
            // Only numeric doc-value sort fields are candidates for widening.
            continue;
        }
        final SortedNumericSortField numericField = (SortedNumericSortField) sortFields[idx];
        switch (numericField.getNumericType()) {
            case INT:
            case LONG:
                sortFields[idx] = getWidenedSortField(numericField, SortField.Type.LONG, Long.BYTES);
                break;
            case FLOAT:
            case DOUBLE:
                sortFields[idx] = getWidenedSortField(numericField, SortField.Type.DOUBLE, Double.BYTES);
                break;
            default:
                // Nothing to widen: Long and Double are the only 8 byte numeric types handled here.
        }
    }
}

/**
 * Builds a sort field whose comparator implements a widened {@code compareValues()}.
 * <p>
 * The returned field is created from the delegate's field name and reverse flag with the given
 * widened type. Its comparator only supports {@code compareValues}; {@code compare},
 * {@code value} and {@code getLeafComparator} throw {@link UnsupportedOperationException}.
 * The delegate's missing value is handed to the comparator only; it is not set on the returned
 * sort field.
 *
 * @param delegate the original numeric sort field supplying field name, reverse flag and missing value
 * @param type     the widened type; {@code LONG} selects long comparison, anything else double comparison
 * @param bytes    the byte size passed to the underlying {@link NumericComparator}
 * @return a sort field that compares two values at the widened (Long/Double) size
 */
private static SortedNumericSortField getWidenedSortField(SortedNumericSortField delegate, SortField.Type type, int bytes) {
    return new SortedNumericSortField(delegate.getField(), type, delegate.getReverse()) {
        @Override
        public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
            final Number missing = (Number) delegate.getMissingValue();
            final NumericComparator<Number> widened = new NumericComparator<Number>(
                delegate.getField(),
                missing,
                delegate.getReverse(),
                enableSkipping,
                bytes
            ) {
                @Override
                public int compareValues(Number first, Number second) {
                    // A null value orders before any non-null value; two nulls are equal.
                    if (first == null) {
                        return second == null ? 0 : -1;
                    }
                    if (second == null) {
                        return 1;
                    }
                    return type == Type.LONG
                        ? Long.compare(first.longValue(), second.longValue())
                        : Double.compare(first.doubleValue(), second.doubleValue());
                }

                @Override
                public int compare(int slot1, int slot2) {
                    throw new UnsupportedOperationException();
                }

                @Override
                public Number value(int slot) {
                    throw new UnsupportedOperationException();
                }

                @Override
                public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
                    throw new UnsupportedOperationException();
                }
            };
            return widened;
        }
    };
}

/*
* Returns the size of the requested top documents (from + size)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.opensearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.opensearch.index.fielddata.fieldcomparator.DoubleValuesComparatorSource;
import org.opensearch.index.fielddata.fieldcomparator.FloatValuesComparatorSource;
import org.opensearch.index.fielddata.fieldcomparator.IntValuesComparatorSource;
import org.opensearch.index.fielddata.fieldcomparator.LongValuesComparatorSource;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.MultiValueMode;
Expand All @@ -65,76 +66,16 @@ public abstract class IndexNumericFieldData implements IndexFieldData<LeafNumeri
* @opensearch.internal
*/
public enum NumericType {
BOOLEAN(false, SortField.Type.LONG, CoreValuesSourceType.BOOLEAN) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
BYTE(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
SHORT(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
INT(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
LONG(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
},
DATE(false, SortField.Type.LONG, CoreValuesSourceType.DATE) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
},
DATE_NANOSECONDS(false, SortField.Type.LONG, CoreValuesSourceType.DATE) {
@Deprecated
@Override
public PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
},
HALF_FLOAT(true, SortField.Type.LONG, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
FLOAT(true, SortField.Type.FLOAT, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.DISABLED;
}
},
DOUBLE(true, SortField.Type.DOUBLE, CoreValuesSourceType.NUMERIC) {
@Deprecated
@Override
protected PointSortOptimization applyPointSortOptimization() {
return PointSortOptimization.ENABLED;
}
};
BOOLEAN(false, SortField.Type.INT, CoreValuesSourceType.BOOLEAN),
BYTE(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
SHORT(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
INT(false, SortField.Type.INT, CoreValuesSourceType.NUMERIC),
LONG(false, SortField.Type.LONG, CoreValuesSourceType.NUMERIC),
DATE(false, SortField.Type.LONG, CoreValuesSourceType.DATE),
DATE_NANOSECONDS(false, SortField.Type.LONG, CoreValuesSourceType.DATE),
HALF_FLOAT(true, SortField.Type.LONG, CoreValuesSourceType.NUMERIC),
FLOAT(true, SortField.Type.FLOAT, CoreValuesSourceType.NUMERIC),
DOUBLE(true, SortField.Type.DOUBLE, CoreValuesSourceType.NUMERIC);

private final boolean floatingPoint;
private final ValuesSourceType valuesSourceType;
Expand All @@ -153,24 +94,6 @@ public final boolean isFloatingPoint() {
public final ValuesSourceType getValuesSourceType() {
return valuesSourceType;
}

@Deprecated
protected abstract PointSortOptimization applyPointSortOptimization();
}

/**
* Controls whether to apply sort optimization to skip non-competitive docs
* based on the BKD index.
*
* @deprecated this control will be removed in a future version of OpenSearch
*
* @opensearch.internal
* @opensearch.experimental
*/
@Deprecated
private enum PointSortOptimization {
ENABLED,
DISABLED
}

/**
Expand Down Expand Up @@ -211,21 +134,6 @@ public final SortField sortField(
: SortedNumericSelector.Type.MIN;
SortField sortField = new SortedNumericSortField(getFieldName(), getNumericType().sortFieldType, reverse, selectorType);
sortField.setMissingValue(source.missingObject(missingValue, reverse));

// LUCENE-9280 added the ability for collectors to skip non-competitive
// documents when top docs are sorted by other fields different from the _score.
// However, from Lucene 9 onwards, numeric sort optimisation requires the byte size
// for points (BKD index) and doc values (columnar) and SortField.Type to be matched.
// NumericType violates this requirement
// (see: https://github.com/opensearch-project/OpenSearch/issues/2063#issuecomment-1069358826 test failure)
// because it uses the largest byte size (LONG) for the SortField of most types. The section below disables
// the BKD based sort optimization for numeric types whose encoded BYTE size does not match the comparator (LONG)/
// So as of now, we can only enable for DATE, DATE_NANOSECONDS, LONG, DOUBLE.
// BOOLEAN, BYTE, SHORT, INT, HALF_FLOAT, FLOAT (use long for doc values, but fewer for BKD Points)
// todo : Enable other SortField.Type as well, that will require wider change
if (getNumericType().applyPointSortOptimization() == PointSortOptimization.DISABLED) {
sortField.setOptimizeSortWithPoints(false);
}
return sortField;
}

Expand Down Expand Up @@ -298,9 +206,12 @@ private XFieldComparatorSource comparatorSource(
return dateComparatorSource(missingValue, sortMode, nested);
case DATE_NANOSECONDS:
return dateNanosComparatorSource(missingValue, sortMode, nested);
default:
case LONG:
assert !targetNumericType.isFloatingPoint();
return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
default:
assert !targetNumericType.isFloatingPoint();
return new IntValuesComparatorSource(this, missingValue, sortMode, nested);
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.index.fielddata.fieldcomparator;

import org.apache.lucene.search.SortField;
import org.opensearch.common.Nullable;
import org.opensearch.index.fielddata.IndexNumericFieldData;
import org.opensearch.search.MultiValueMode;

/**
 * Comparator source for int values.
 * <p>
 * Behaves like {@link LongValuesComparatorSource}, from which it inherits everything,
 * except that it reports {@link SortField.Type#INT} as its reduced type.
 *
 * @opensearch.internal
 */
public class IntValuesComparatorSource extends LongValuesComparatorSource {

    /**
     * Creates the comparator source by forwarding all arguments unchanged to
     * {@link LongValuesComparatorSource}.
     *
     * @param indexFieldData the numeric field data to sort on
     * @param missingValue   the value to use for documents without a value, may be {@code null}
     * @param sortMode       how multiple values of one document are reduced
     * @param nested         the nested sort context
     */
    public IntValuesComparatorSource(
        final IndexNumericFieldData indexFieldData,
        @Nullable final Object missingValue,
        final MultiValueMode sortMode,
        final Nested nested
    ) {
        super(indexFieldData, missingValue, sortMode, nested);
    }

    /**
     * @return always {@link SortField.Type#INT}
     */
    @Override
    public SortField.Type reducedType() {
        return SortField.Type.INT;
    }
}
Loading

0 comments on commit dfde04a

Please sign in to comment.