Avoid double term construction in DfsPhase #38716

Merged · 4 commits · Feb 15, 2019 · Diff shows changes from 2 commits
@@ -20,7 +20,6 @@
 package org.elasticsearch.example.rescore;

 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
@@ -46,7 +45,6 @@
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Objects;
-import java.util.Set;

 import static java.util.Collections.singletonList;
 import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
@@ -225,7 +223,7 @@ public Explanation explain(int topLevelDocId, IndexSearcher searcher, RescoreCon
     }

     @Override
-    public void extractTerms(IndexSearcher searcher, RescoreContext rescoreContext, Set<Term> termsSet) {
+    public void extractTerms(IndexSearcher searcher, RescoreContext rescoreContext) {

Contributor: The new signature is a bit weird: the only option is to call createWeight on the searcher, but that is obfuscated, so you need to check an actual implementation to realize it.

         // Since we don't use queries there are no terms to extract.
     }
 }
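To make the reviewer's point concrete, here is a minimal sketch of the new contract (a hypothetical class, not from this PR; it mirrors the QueryRescorer change further down): an implementation no longer fills a Set<Term>, it just builds a Weight against whatever searcher it is handed, and DfsPhase passes in a searcher that records the statistics the weight pulls.

import java.io.IOException;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;

// Hypothetical rescorer illustrating the new extractTerms contract.
final class SketchRescorer {
    private final Query query; // the query this rescorer would run

    SketchRescorer(Query query) {
        this.query = query;
    }

    // Under the new signature there is no terms set to fill: building the weight
    // is the whole job, because the searcher DfsPhase hands us intercepts the
    // termStatistics/collectionStatistics calls the weight makes.
    public void extractTerms(IndexSearcher searcher) throws IOException {
        searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE, 1f);
    }
}

The QueryRescorer hunk below follows exactly this pattern.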
server/src/main/java/org/elasticsearch/search/dfs/DfsPhase.java (124 changes: 35 additions & 89 deletions)
@@ -19,14 +19,11 @@

 package org.elasticsearch.search.dfs;

-import com.carrotsearch.hppc.ObjectHashSet;
 import com.carrotsearch.hppc.ObjectObjectHashMap;
-import com.carrotsearch.hppc.cursors.ObjectCursor;

-import org.apache.lucene.index.IndexReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.TermStatistics;
 import org.elasticsearch.common.collect.HppcMaps;
@@ -36,9 +33,8 @@
 import org.elasticsearch.tasks.TaskCancelledException;

 import java.io.IOException;
-import java.util.AbstractSet;
-import java.util.Collection;
-import java.util.Iterator;
+import java.util.HashMap;
+import java.util.Map;

 /**
  * Dfs phase of a search request, used to make scoring 100% accurate by collecting additional info from each shard before the query phase.
@@ -52,101 +48,51 @@ public void preProcess(SearchContext context) {

     @Override
     public void execute(SearchContext context) {
-        final ObjectHashSet<Term> termsSet = new ObjectHashSet<>();
         try {
-            context.searcher().createWeight(context.searcher().rewrite(context.query()), ScoreMode.COMPLETE, 1f)
-                .extractTerms(new DelegateSet(termsSet));
-            for (RescoreContext rescoreContext : context.rescore()) {
-                try {
-                    rescoreContext.rescorer().extractTerms(context.searcher(), rescoreContext, new DelegateSet(termsSet));
-                } catch (IOException e) {
-                    throw new IllegalStateException("Failed to extract terms", e);
-                }
-            }
-
-            Term[] terms = termsSet.toArray(Term.class);
-            TermStatistics[] termStatistics = new TermStatistics[terms.length];
-            IndexReaderContext indexReaderContext = context.searcher().getTopReaderContext();
-            for (int i = 0; i < terms.length; i++) {
-                if(context.isCancelled()) {
-                    throw new TaskCancelledException("cancelled");
-                }
-                // LUCENE 4 UPGRADE: cache TermStates?
-                TermStates termContext = TermStates.build(indexReaderContext, terms[i], true);
-                termStatistics[i] = context.searcher().termStatistics(terms[i], termContext);
-            }
-
             ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics = HppcMaps.newNoNullKeysMap();
-            for (Term term : terms) {
-                assert term.field() != null : "field is null";
-                if (fieldStatistics.containsKey(term.field()) == false) {
-                    final CollectionStatistics collectionStatistics = context.searcher().collectionStatistics(term.field());
-                    if (collectionStatistics != null) {
-                        fieldStatistics.put(term.field(), collectionStatistics);
-                    }
-                }
-            }
Contributor: I wonder if this is really equivalent. Some queries are going to build term statistics even though they don't add terms in extractTerms, for various reasons (GlobalOrdinalsQuery for instance), so we'll end up with more terms than before. However, the good side of this change is that we'd extract only the terms that are used for scoring instead of all terms that are present in the query. The API change for the rescorer is too obfuscated IMO, but one thing I fully agree with is that we don't need the special map, so we could rely on a plain HashMap to clean up the code a bit?

Contributor Author:

> Some queries are going to build term statistics even though they don't add terms in extractTerms for various reasons (GlobalOrdinalsQuery for instance)

I think this is incorrect: queries will only build term statistics if they actually need them, whereas extractTerms doesn't always track that. TermWeight, for example, always adds its term, even if it has been created with ScoreMode.COMPLETE_NO_SCORES, while GlobalOrdinalsQuery creates its child weight with COMPLETE_NO_SCORES, so no term stats will be pulled here.

I agree about the API change on the rescorer though, let me think about a better way to do that.

+            Map<Term, TermStatistics> stats = new HashMap<>();
+            IndexSearcher searcher = new IndexSearcher(context.searcher().getIndexReader()) {
+                @Override
+                public TermStatistics termStatistics(Term term, TermStates states) throws IOException {
+                    if (context.isCancelled()) {
+                        throw new TaskCancelledException("cancelled");
+                    }
+                    TermStatistics ts = super.termStatistics(term, states);
+                    if (ts != null) {
+                        stats.put(term, ts);
+                    }
+                    return ts;
+                }
+
+                @Override
+                public CollectionStatistics collectionStatistics(String field) throws IOException {
+                    if (context.isCancelled()) {
+                        throw new TaskCancelledException("cancelled");
+                    }
+                    CollectionStatistics cs = super.collectionStatistics(field);
+                    if (cs != null) {
+                        fieldStatistics.put(field, cs);
+                    }
+                    return cs;
+                }
+            };
+
+            searcher.createWeight(context.searcher().rewrite(context.query()), ScoreMode.COMPLETE, 1);
+            for (RescoreContext rescoreContext : context.rescore()) {
+                rescoreContext.rescorer().extractTerms(searcher, rescoreContext);
+            }
+
+            Term[] terms = stats.keySet().toArray(new Term[0]);
+            TermStatistics[] termStatistics = new TermStatistics[terms.length];
+            for (int i = 0; i < terms.length; i++) {
+                termStatistics[i] = stats.get(terms[i]);
+            }

             context.dfsResult().termsStatistics(terms, termStatistics)
                 .fieldStatistics(fieldStatistics)
                 .maxDoc(context.searcher().getIndexReader().maxDoc());
         } catch (Exception e) {
             throw new DfsPhaseExecutionException(context, "Exception during dfs phase", e);
-        } finally {
-            termsSet.clear(); // don't hold on to terms
         }
     }
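To ground the author's ScoreMode argument from the thread above: the weight built for a clause is what pulls statistics, and it typically does so only when the score mode needs scores. A simplified sketch of that gating (illustrative only, not the actual Lucene TermWeight code):

import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TermStatistics;

// Illustrative sketch: statistics are pulled only for scoring clauses, so the
// recording searcher above ends up seeing exactly the terms used for scoring.
final class ScoreModeGatingSketch {
    static TermStatistics statsIfScoring(IndexSearcher searcher, Term term,
                                         TermStates states, ScoreMode scoreMode) throws IOException {
        if (scoreMode.needsScores()) {
            // A TermQuery-style weight pulls stats here; a parent query like
            // GlobalOrdinalsQuery that creates its child weight with
            // COMPLETE_NO_SCORES never reaches this call.
            return searcher.termStatistics(term, states);
        }
        return null; // no scores needed: no statistics lookup, no recorded term
    }
}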

-    // We need to bridge to JCF world, b/c of Query#extractTerms
-    private static class DelegateSet extends AbstractSet<Term> {
-
-        private final ObjectHashSet<Term> delegate;
-
-        private DelegateSet(ObjectHashSet<Term> delegate) {
-            this.delegate = delegate;
-        }
-
-        @Override
-        public boolean add(Term term) {
-            return delegate.add(term);
-        }
-
-        @Override
-        public boolean addAll(Collection<? extends Term> terms) {
-            boolean result = false;
-            for (Term term : terms) {
-                result = delegate.add(term);
-            }
-            return result;
-        }
-
-        @Override
-        public Iterator<Term> iterator() {
-            final Iterator<ObjectCursor<Term>> iterator = delegate.iterator();
-            return new Iterator<Term>() {
-                @Override
-                public boolean hasNext() {
-                    return iterator.hasNext();
-                }
-
-                @Override
-                public Term next() {
-                    return iterator.next().value;
-                }
-
-                @Override
-                public void remove() {
-                    throw new UnsupportedOperationException();
-                }
-            };
-        }
-
-        @Override
-        public int size() {
-            return delegate.size();
-        }
-    }
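For readers who want to see the interception trick in isolation, here is a self-contained sketch against the Lucene 8 APIs this PR targets; the index setup and class name are invented for the demo and are not part of the PR.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

// Demo (assumed setup): createWeight with ScoreMode.COMPLETE makes the query
// pull term statistics through the searcher, and the override records them.
public class StatsRecordingDemo {
    public static void main(String[] args) throws IOException {
        Directory dir = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("body", "hello world", Store.NO));
            writer.addDocument(doc);
        }
        Map<Term, TermStatistics> stats = new HashMap<>();
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader) {
                @Override
                public TermStatistics termStatistics(Term term, TermStates states) throws IOException {
                    TermStatistics ts = super.termStatistics(term, states);
                    if (ts != null) {
                        stats.put(term, ts); // record every statistic the weight pulls
                    }
                    return ts;
                }
            };
            searcher.createWeight(searcher.rewrite(new TermQuery(new Term("body", "hello"))),
                ScoreMode.COMPLETE, 1f);
            System.out.println(stats.keySet()); // should print [body:hello]
        }
    }
}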
@@ -19,7 +19,6 @@

 package org.elasticsearch.search.rescore;

-import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
@@ -29,9 +28,10 @@

 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
-import java.util.Set;
-import java.util.Collections;

 import static java.util.stream.Collectors.toSet;

 public final class QueryRescorer implements Rescorer {
@@ -204,9 +204,9 @@ public void setScoreMode(String scoreMode) {
     }

     @Override
-    public void extractTerms(IndexSearcher searcher, RescoreContext rescoreContext, Set<Term> termsSet) throws IOException {
+    public void extractTerms(IndexSearcher searcher, RescoreContext rescoreContext) throws IOException {
         Query query = ((QueryRescoreContext) rescoreContext).query();
-        searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f).extractTerms(termsSet);
+        searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE, 1f);
     }

 }
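Worth noting in the QueryRescorer hunk above: the ScoreMode changes from COMPLETE_NO_SCORES to COMPLETE. With the statistics-recording searcher, only statistics the weight actually pulls get collected, and (as the author points out in the thread above) a weight created with COMPLETE_NO_SCORES typically skips those lookups, so COMPLETE is needed for the rescore query's terms to show up in the dfs result.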
@@ -19,14 +19,12 @@

 package org.elasticsearch.search.rescore;

-import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TopDocs;
 import org.elasticsearch.action.search.SearchType;

 import java.io.IOException;
-import java.util.Set;

 /**
  * A query rescorer interface used to re-rank the Top-K results of a previously
@@ -66,5 +64,5 @@ Explanation explain(int topLevelDocId, IndexSearcher searcher, RescoreContext re
      * is executed in a distributed frequency collection roundtrip for
      * {@link SearchType#DFS_QUERY_THEN_FETCH}
      */
-    void extractTerms(IndexSearcher searcher, RescoreContext rescoreContext, Set<Term> termsSet) throws IOException;
+    void extractTerms(IndexSearcher searcher, RescoreContext rescoreContext) throws IOException;
 }