Skip to content

Commit ee0ce8f

Browse files
authored
Fix a bug with missing fields in sig_terms (#57757)
When you run a `significant_terms` aggregation on a field and it *is* mapped but there aren't any values for it then the count of the documents that match the query on that shard still have to be added to the overall doc count. I broke that in #57361. This fixes that. Closes #57402
1 parent 70e63a3 commit ee0ce8f

File tree

5 files changed

+182
-8
lines changed

5 files changed

+182
-8
lines changed

rest-api-spec/src/main/resources/rest-api-spec/test/msearch/20_typed_keys.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,6 @@ setup:
9090

9191
---
9292
"Multisearch test with typed_keys parameter for sampler and significant terms":
93-
- skip:
94-
version: "all"
95-
reason: "AwaitsFix https://github.com/elastic/elasticsearch/issues/57402"
9693
- do:
9794
msearch:
9895
rest_total_hits_as_int: true

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,12 @@ protected StringTerms buildEmptyTermsAggregation() {
4949
metadata(), format, bucketCountThresholds.getShardSize(), showTermDocCountError, 0, emptyList(), 0);
5050
}
5151

52-
protected SignificantStringTerms buildEmptySignificantTermsAggregation(SignificanceHeuristic significanceHeuristic) {
52+
protected SignificantStringTerms buildEmptySignificantTermsAggregation(long subsetSize, SignificanceHeuristic significanceHeuristic) {
5353
// We need to account for the significance of a miss in our global stats - provide corpus size as context
5454
ContextIndexSearcher searcher = context.searcher();
5555
IndexReader topReader = searcher.getIndexReader();
5656
int supersetSize = topReader.numDocs();
5757
return new SignificantStringTerms(name, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getMinDocCount(),
58-
metadata(), format, 0, supersetSize, significanceHeuristic, emptyList());
58+
metadata(), format, subsetSize, supersetSize, significanceHeuristic, emptyList());
5959
}
6060
}

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,7 @@ SignificantStringTerms buildResult(SignificantStringTerms.Bucket[] topBuckets, l
767767

768768
@Override
769769
SignificantStringTerms buildEmptyResult() {
770-
return buildEmptySignificantTermsAggregation(significanceHeuristic);
770+
return buildEmptySignificantTermsAggregation(subsetSize, significanceHeuristic);
771771
}
772772

773773
@Override

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ SignificantStringTerms buildResult(SignificantStringTerms.Bucket[] topBuckets, l
435435

436436
@Override
437437
SignificantStringTerms buildEmptyResult() {
438-
return buildEmptySignificantTermsAggregation(significanceHeuristic);
438+
return buildEmptySignificantTermsAggregation(subsetSize, significanceHeuristic);
439439
}
440440

441441
@Override

server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorTests.java

Lines changed: 178 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.lucene.document.BinaryDocValuesField;
2424
import org.apache.lucene.document.Document;
2525
import org.apache.lucene.document.Field;
26+
import org.apache.lucene.document.SortedDocValuesField;
2627
import org.apache.lucene.document.SortedNumericDocValuesField;
2728
import org.apache.lucene.document.StoredField;
2829
import org.apache.lucene.index.DirectoryReader;
@@ -146,8 +147,9 @@ public void testSignificance() throws IOException {
146147
IndexSearcher searcher = new IndexSearcher(reader);
147148

148149
// Search "odd"
149-
SignificantTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
150+
SignificantStringTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
150151

152+
assertThat(terms.getSubsetSize(), equalTo(5L));
151153
assertEquals(1, terms.getBuckets().size());
152154
assertNull(terms.getBucketByKey("even"));
153155
assertNull(terms.getBucketByKey("common"));
@@ -156,6 +158,7 @@ public void testSignificance() throws IOException {
156158
// Search even
157159
terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigAgg, textFieldType);
158160

161+
assertThat(terms.getSubsetSize(), equalTo(5L));
159162
assertEquals(1, terms.getBuckets().size());
160163
assertNull(terms.getBucketByKey("odd"));
161164
assertNull(terms.getBucketByKey("common"));
@@ -164,6 +167,7 @@ public void testSignificance() throws IOException {
164167
// Search odd with regex includeexcludes
165168
sigAgg.includeExclude(new IncludeExclude("o.d", null));
166169
terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
170+
assertThat(terms.getSubsetSize(), equalTo(5L));
167171
assertEquals(1, terms.getBuckets().size());
168172
assertNotNull(terms.getBucketByKey("odd"));
169173
assertNull(terms.getBucketByKey("common"));
@@ -176,6 +180,7 @@ public void testSignificance() throws IOException {
176180
sigAgg.includeExclude(new IncludeExclude(oddStrings, evenStrings));
177181
sigAgg.significanceHeuristic(SignificanceHeuristicTests.getRandomSignificanceheuristic());
178182
terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
183+
assertThat(terms.getSubsetSize(), equalTo(5L));
179184
assertEquals(1, terms.getBuckets().size());
180185
assertNotNull(terms.getBucketByKey("odd"));
181186
assertNull(terms.getBucketByKey("weird"));
@@ -185,6 +190,7 @@ public void testSignificance() throws IOException {
185190

186191
sigAgg.includeExclude(new IncludeExclude(evenStrings, oddStrings));
187192
terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
193+
assertThat(terms.getSubsetSize(), equalTo(5L));
188194
assertEquals(0, terms.getBuckets().size());
189195
assertNull(terms.getBucketByKey("odd"));
190196
assertNull(terms.getBucketByKey("weird"));
@@ -240,13 +246,15 @@ public void testNumericSignificance() throws IOException {
240246
// Search "odd"
241247
SignificantLongTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigNumAgg, longFieldType);
242248
assertEquals(1, terms.getBuckets().size());
249+
assertThat(terms.getSubsetSize(), equalTo(5L));
243250

244251
assertNull(terms.getBucketByKey(Long.toString(EVEN_VALUE)));
245252
assertNull(terms.getBucketByKey(Long.toString(COMMON_VALUE)));
246253
assertNotNull(terms.getBucketByKey(Long.toString(ODD_VALUE)));
247254

248255
terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigNumAgg, longFieldType);
249256
assertEquals(1, terms.getBuckets().size());
257+
assertThat(terms.getSubsetSize(), equalTo(5L));
250258

251259
assertNull(terms.getBucketByKey(Long.toString(ODD_VALUE)));
252260
assertNull(terms.getBucketByKey(Long.toString(COMMON_VALUE)));
@@ -379,6 +387,172 @@ public void testFieldAlias() throws IOException {
379387
}
380388
}
381389

390+
public void testAllDocsWithoutStringFieldviaGlobalOrds() throws IOException {
391+
testAllDocsWithoutStringField("global_ordinals");
392+
}
393+
394+
public void testAllDocsWithoutStringFieldViaMap() throws IOException {
395+
testAllDocsWithoutStringField("map");
396+
}
397+
398+
/**
399+
* Make sure that when the field is mapped but there aren't any values
400+
* for it we return a properly shaped "empty" result. In particular, the
401+
* {@link InternalMappedSignificantTerms#getSubsetSize()} needs to be set
402+
* to the number of matching documents.
403+
*/
404+
private void testAllDocsWithoutStringField(String executionHint) throws IOException {
405+
try (Directory dir = newDirectory()) {
406+
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
407+
writer.addDocument(new Document());
408+
try (IndexReader reader = maybeWrapReaderEs(writer.getReader())) {
409+
IndexSearcher searcher = newIndexSearcher(reader);
410+
SignificantTermsAggregationBuilder request = new SignificantTermsAggregationBuilder("f").field("f")
411+
.executionHint(executionHint);
412+
SignificantStringTerms result = search(searcher, new MatchAllDocsQuery(), request, keywordField("f"));
413+
assertThat(result.getSubsetSize(), equalTo(1L));
414+
}
415+
}
416+
}
417+
}
418+
419+
/**
420+
* Make sure that when the field is mapped but there aren't any values
421+
* for it we return a properly shaped "empty" result. In particular, the
422+
* {@link InternalMappedSignificantTerms#getSubsetSize()} needs to be set
423+
* to the number of matching documents.
424+
*/
425+
public void testAllDocsWithoutNumericField() throws IOException {
426+
try (Directory dir = newDirectory()) {
427+
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
428+
writer.addDocument(new Document());
429+
try (IndexReader reader = maybeWrapReaderEs(writer.getReader())) {
430+
IndexSearcher searcher = newIndexSearcher(reader);
431+
SignificantTermsAggregationBuilder request = new SignificantTermsAggregationBuilder("f").field("f");
432+
SignificantLongTerms result = search(searcher, new MatchAllDocsQuery(), request, longField("f"));
433+
assertThat(result.getSubsetSize(), equalTo(1L));
434+
}
435+
}
436+
}
437+
}
438+
439+
public void testSomeDocsWithoutStringFieldviaGlobalOrds() throws IOException {
440+
testSomeDocsWithoutStringField("global_ordinals");
441+
}
442+
443+
public void testSomeDocsWithoutStringFieldViaMap() throws IOException {
444+
testSomeDocsWithoutStringField("map");
445+
}
446+
447+
/**
448+
* Make sure that when the field a segment doesn't contain the field we
449+
* still include the count of its matching documents
450+
* in {@link InternalMappedSignificantTerms#getSubsetSize()}.
451+
*/
452+
private void testSomeDocsWithoutStringField(String executionHint) throws IOException {
453+
try (Directory dir = newDirectory()) {
454+
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
455+
Document d = new Document();
456+
d.add(new SortedDocValuesField("f", new BytesRef("f")));
457+
writer.addDocument(d);
458+
writer.flush();
459+
writer.addDocument(new Document());
460+
try (IndexReader reader = maybeWrapReaderEs(writer.getReader())) {
461+
IndexSearcher searcher = newIndexSearcher(reader);
462+
SignificantTermsAggregationBuilder request = new SignificantTermsAggregationBuilder("f").field("f")
463+
.executionHint(executionHint);
464+
SignificantStringTerms result = search(searcher, new MatchAllDocsQuery(), request, keywordField("f"));
465+
assertThat(result.getSubsetSize(), equalTo(2L));
466+
}
467+
}
468+
}
469+
}
470+
471+
/**
472+
* Make sure that when the field a segment doesn't contain the field we
473+
* still include the count of its matching documents
474+
* in {@link InternalMappedSignificantTerms#getSubsetSize()}.
475+
*/
476+
public void testSomeDocsWithoutNumericField() throws IOException {
477+
try (Directory dir = newDirectory()) {
478+
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
479+
Document d = new Document();
480+
d.add(new SortedNumericDocValuesField("f", 1));
481+
writer.addDocument(d);
482+
writer.addDocument(new Document());
483+
try (IndexReader reader = maybeWrapReaderEs(writer.getReader())) {
484+
IndexSearcher searcher = newIndexSearcher(reader);
485+
SignificantTermsAggregationBuilder request = new SignificantTermsAggregationBuilder("f").field("f");
486+
SignificantLongTerms result = search(searcher, new MatchAllDocsQuery(), request, longField("f"));
487+
assertThat(result.getSubsetSize(), equalTo(2L));
488+
}
489+
}
490+
}
491+
}
492+
493+
public void testThreeLayerStringViaGlobalOrds() throws IOException {
494+
threeLayerStringTestCase("global_ordinals");
495+
}
496+
497+
public void testThreeLayerStringViaMap() throws IOException {
498+
threeLayerStringTestCase("map");
499+
}
500+
501+
private void threeLayerStringTestCase(String executionHint) throws IOException {
502+
try (Directory dir = newDirectory()) {
503+
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
504+
for (int i = 0; i < 10; i++) {
505+
for (int j = 0; j < 10; j++) {
506+
for (int k = 0; k < 10; k++) {
507+
Document d = new Document();
508+
d.add(new SortedDocValuesField("i", new BytesRef(Integer.toString(i))));
509+
d.add(new SortedDocValuesField("j", new BytesRef(Integer.toString(j))));
510+
d.add(new SortedDocValuesField("k", new BytesRef(Integer.toString(k))));
511+
writer.addDocument(d);
512+
}
513+
}
514+
}
515+
try (IndexReader reader = maybeWrapReaderEs(writer.getReader())) {
516+
IndexSearcher searcher = newIndexSearcher(reader);
517+
SignificantTermsAggregationBuilder kRequest = new SignificantTermsAggregationBuilder("k").field("k")
518+
.executionHint(executionHint);
519+
SignificantTermsAggregationBuilder jRequest = new SignificantTermsAggregationBuilder("j").field("j")
520+
.executionHint(executionHint)
521+
.subAggregation(kRequest);
522+
SignificantTermsAggregationBuilder request = new SignificantTermsAggregationBuilder("i").field("i")
523+
.executionHint(executionHint)
524+
.subAggregation(jRequest);
525+
SignificantStringTerms result = search(
526+
searcher,
527+
new MatchAllDocsQuery(),
528+
request,
529+
keywordField("i"),
530+
keywordField("j"),
531+
keywordField("k")
532+
);
533+
assertThat(result.getSubsetSize(), equalTo(1000L));
534+
for (int i = 0; i < 10; i++) {
535+
SignificantStringTerms.Bucket iBucket = result.getBucketByKey(Integer.toString(i));
536+
assertThat(iBucket.getDocCount(), equalTo(100L));
537+
SignificantStringTerms jAgg = iBucket.getAggregations().get("j");
538+
assertThat(jAgg.getSubsetSize(), equalTo(100L));
539+
for (int j = 0; j < 10; j++) {
540+
SignificantStringTerms.Bucket jBucket = jAgg.getBucketByKey(Integer.toString(j));
541+
assertThat(jBucket.getDocCount(), equalTo(10L));
542+
SignificantStringTerms kAgg = jBucket.getAggregations().get("k");
543+
assertThat(kAgg.getSubsetSize(), equalTo(10L));
544+
for (int k = 0; k < 10; k++) {
545+
SignificantStringTerms.Bucket kBucket = kAgg.getBucketByKey(Integer.toString(k));
546+
assertThat(jAgg.getSubsetSize(), equalTo(100L));
547+
assertThat(kBucket.getDocCount(), equalTo(1L));
548+
}
549+
}
550+
}
551+
}
552+
}
553+
}
554+
}
555+
382556
public void testThreeLayerLong() throws IOException {
383557
try (Directory dir = newDirectory()) {
384558
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
@@ -400,14 +574,17 @@ public void testThreeLayerLong() throws IOException {
400574
.subAggregation(new SignificantTermsAggregationBuilder("k").field("k")));
401575
SignificantLongTerms result = search(searcher, new MatchAllDocsQuery(), request,
402576
longField("i"), longField("j"), longField("k"));
577+
assertThat(result.getSubsetSize(), equalTo(1000L));
403578
for (int i = 0; i < 10; i++) {
404579
SignificantLongTerms.Bucket iBucket = result.getBucketByKey(Integer.toString(i));
405580
assertThat(iBucket.getDocCount(), equalTo(100L));
406581
SignificantLongTerms jAgg = iBucket.getAggregations().get("j");
582+
assertThat(jAgg.getSubsetSize(), equalTo(100L));
407583
for (int j = 0; j < 10; j++) {
408584
SignificantLongTerms.Bucket jBucket = jAgg.getBucketByKey(Integer.toString(j));
409585
assertThat(jBucket.getDocCount(), equalTo(10L));
410586
SignificantLongTerms kAgg = jBucket.getAggregations().get("k");
587+
assertThat(kAgg.getSubsetSize(), equalTo(10L));
411588
for (int k = 0; k < 10; k++) {
412589
SignificantLongTerms.Bucket kBucket = kAgg.getBucketByKey(Integer.toString(k));
413590
assertThat(kBucket.getDocCount(), equalTo(1L));

0 commit comments

Comments
 (0)