diff --git a/dao-impl/elasticsearch-dao/src/main/java/com/linkedin/metadata/dao/search/ESSearchDAO.java b/dao-impl/elasticsearch-dao/src/main/java/com/linkedin/metadata/dao/search/ESSearchDAO.java index fd1558c20..42a99cb1a 100644 --- a/dao-impl/elasticsearch-dao/src/main/java/com/linkedin/metadata/dao/search/ESSearchDAO.java +++ b/dao-impl/elasticsearch-dao/src/main/java/com/linkedin/metadata/dao/search/ESSearchDAO.java @@ -55,13 +55,13 @@ @Slf4j public class ESSearchDAO extends BaseSearchDAO { - private static final Integer DEFAULT_TERM_BUCKETS_SIZE_100 = 100; private static final String URN_FIELD = "urn"; private RestHighLevelClient _client; private BaseSearchConfig _config; private BaseESAutoCompleteQuery _autoCompleteQueryForLowCardFields; private BaseESAutoCompleteQuery _autoCompleteQueryForHighCardFields; + private int _maxTermBucketSize = 100; // TODO: Currently takes elastic search client, in future, can take other clients such as galene // TODO: take params and settings needed to create the client @@ -254,7 +254,7 @@ private void buildAggregations(@Nonnull SearchSourceBuilder searchSourceBuilder, @Nullable Filter filter) { Set facetFields = _config.getFacetFields(); for (String facet : facetFields) { - AggregationBuilder aggBuilder = AggregationBuilders.terms(facet).field(facet).size(DEFAULT_TERM_BUCKETS_SIZE_100); + AggregationBuilder aggBuilder = AggregationBuilders.terms(facet).field(facet).size(_maxTermBucketSize); Optional.ofNullable(filter).map(Filter::getCriteria).ifPresent(criteria -> { for (Criterion criterion : criteria) { if (!facetFields.contains(criterion.getField()) || criterion.getField().equals(facet)) { @@ -446,4 +446,16 @@ private Urn getUrnFromSearchHit(@Nonnull SearchHit hit) { throw new RuntimeException("Invalid urn in search document " + e); } } + + /** + * Sets max term bucket size in the aggregation results. + * + *

The default value might not always be good enough when aggregation happens on a high cardinality field. + * Using a high default instead is also not ideal because of potential query performance degradation. + * Instead, entities which have a rare use case of aggregating over high cardinality fields can use this method + * to configure the aggregation behavior. + */ + public void setMaxTermBucketSize(int maxTermBucketSize) { + _maxTermBucketSize = maxTermBucketSize; + } } diff --git a/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/ESSearchDAOTest.java b/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/ESSearchDAOTest.java index 946321e64..e985020c5 100644 --- a/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/ESSearchDAOTest.java +++ b/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/ESSearchDAOTest.java @@ -28,6 +28,7 @@ import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHits; +import org.elasticsearch.search.aggregations.AggregationBuilders; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -212,6 +213,23 @@ public void testPreferenceInSearchQuery() { assertEquals(searchRequest.preference(), preference); } + @Test + public void testSetMaxTermBucketSize() { + String facetFieldName = "value"; + Filter filter = QueryUtils.newFilter(Collections.singletonMap(facetFieldName, "dummy")); + + // Default max term bucket size + SearchRequest searchRequest = _searchDAO.constructSearchQuery("dummy", filter, null, null, 0, 10); + assertEquals(searchRequest.source().aggregations().getAggregatorFactories().get(0), + AggregationBuilders.terms(facetFieldName).field(facetFieldName).size(100)); + + // Modified max term bucket size + _searchDAO.setMaxTermBucketSize(5); + searchRequest = _searchDAO.constructSearchQuery("dummy", filter, null, null, 0, 10); + assertEquals(searchRequest.source().aggregations().getAggregatorFactories().get(0), + AggregationBuilders.terms(facetFieldName).field(facetFieldName).size(5)); + } + private static SearchHit makeSearchHit(int id) { SearchHit hit = mock(SearchHit.class); Map sourceMap = new HashMap<>(); diff --git a/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/TestSearchConfig.java b/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/TestSearchConfig.java index c9558b91e..e56fe2dac 100644 --- a/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/TestSearchConfig.java +++ b/dao-impl/elasticsearch-dao/src/test/java/com/linkedin/metadata/dao/search/TestSearchConfig.java @@ -2,7 +2,6 @@ import com.linkedin.testing.EntityDocument; import java.util.Collections; -import java.util.HashSet; import java.util.Set; import javax.annotation.Nonnull; @@ -10,7 +9,7 @@ public class TestSearchConfig extends BaseSearchConfig { @Override @Nonnull public Set getFacetFields() { - return Collections.unmodifiableSet(new HashSet<>()); + return Collections.singleton("value"); } @Override diff --git a/testing/test-models/src/main/pegasus/com/linkedin/testing/EntityDocument.pdl b/testing/test-models/src/main/pegasus/com/linkedin/testing/EntityDocument.pdl index 92717ec55..f71a1dc68 100644 --- a/testing/test-models/src/main/pegasus/com/linkedin/testing/EntityDocument.pdl +++ b/testing/test-models/src/main/pegasus/com/linkedin/testing/EntityDocument.pdl @@ -11,4 +11,9 @@ record EntityDocument { * For unit tests */ urn: Urn + + /** + * For unit tests + */ + value: optional string } \ No newline at end of file