Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Grouping on arrays as arrays #12078

Merged
merged 42 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
2265991
init multiValue column group by
cryptoe Dec 7, 2021
fe3c9d9
Changing sorting to Lexicographic as default
cryptoe Dec 7, 2021
d50cdfb
Adding initial tests
cryptoe Dec 15, 2021
ba28913
1.Fixing test cases adding
cryptoe Dec 17, 2021
6cd8b65
Linking SQL layer to native layer
cryptoe Dec 20, 2021
23a44a4
Adding multiDimension support to group by column strategy
cryptoe Dec 21, 2021
e44258b
Merge branch 'master' into group_by_arrays
cryptoe Dec 21, 2021
5b6fa68
1. Removing array coercion in Calcite layer
cryptoe Jan 5, 2022
3ca3aa0
1. Supporting all primitive array types
cryptoe Jan 6, 2022
80ff204
1. Supporting all primitive array types
cryptoe Jan 6, 2022
aeba7c5
1. Checkstyle things
cryptoe Jan 6, 2022
7bb3df0
Minor naming things
cryptoe Jan 6, 2022
a4b7954
Merge branch 'master' of github.com:apache/druid into group_by_arrays
cryptoe Jan 7, 2022
6e3f806
CheckStyle Things
cryptoe Jan 7, 2022
c6db303
Fixing test case
cryptoe Jan 7, 2022
b8fe780
Fixing hashing
cryptoe Jan 7, 2022
985fd38
1. Adding the MV function
cryptoe Jan 7, 2022
2d4f01d
1. Adding MV function test cases
cryptoe Jan 8, 2022
1e3129f
Adding Selector strategy function test cases
cryptoe Jan 9, 2022
6dc2961
Fixing ClientQuerySegmentWalkerTest
cryptoe Jan 9, 2022
df5210e
Adding GroupByQueryRunnerTest test cases
cryptoe Jan 10, 2022
607fd54
Fixing test cases
cryptoe Jan 10, 2022
79246e0
Adding few more test cases
cryptoe Jan 10, 2022
2eace6a
Fixing Exception asset statement and intellij inspection
cryptoe Jan 10, 2022
b8711ca
Adding null compatibility tests
cryptoe Jan 10, 2022
1ba5549
Review comments
cryptoe Jan 11, 2022
2d676ed
Fixing few failing tests
cryptoe Jan 11, 2022
ce87461
Merge branch 'master' of github.com:apache/druid into group_by_arrays
cryptoe Jan 12, 2022
40b877e
Fixing few failing tests
cryptoe Jan 13, 2022
57dcbe1
Do no convert to topN Q incase of group by on array
cryptoe Jan 13, 2022
86fb48b
Fixing checkstyle
cryptoe Jan 13, 2022
8619964
Fixing differences between jdk's class cast exception message
cryptoe Jan 14, 2022
63081d3
1. Fixing ordering if the grouping key is an array
cryptoe Jan 14, 2022
97350a5
Fixing DefaultLimitSpec
cryptoe Jan 14, 2022
9b21150
Fixing CalciteArraysQueryTest
cryptoe Jan 15, 2022
ea5ba15
Merge branch 'master' of github.com:apache/druid into group_by_arrays
cryptoe Jan 17, 2022
1a6927f
Dummy commit for LGTM
cryptoe Jan 17, 2022
ec07ee7
changes:
clintropolis Jan 20, 2022
1156180
Review comments
cryptoe Jan 25, 2022
d00e0b2
Fixing test cases
cryptoe Jan 25, 2022
169f6e2
Fixing spot bugs
cryptoe Jan 25, 2022
e4e8d13
Fixing strict compile
cryptoe Jan 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public class NullHandling
public static final Long ZERO_LONG = 0L;
public static final byte IS_NULL_BYTE = (byte) 1;
public static final byte IS_NOT_NULL_BYTE = (byte) 0;
public static final String[] NULL_STRING_ARRAY = new String[0];

/**
* INSTANCE is injected using static injection to avoid adding JacksonInject annotations all over the code.
Expand Down Expand Up @@ -118,6 +119,11 @@ public static Double defaultDoubleValue()
return replaceWithDefault() ? ZERO_DOUBLE : null;
}

public static String[] defaultStringValues()
{
return NULL_STRING_ARRAY;
}

/**
* Returns the default value for an object of the provided class. Will be null in SQL-compatible null handling mode.
* May be null or some non-null default value when not in SQL-compatible null handling mode.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ public class QueryContexts
public static final String ENABLE_DEBUG = "debug";
public static final String BY_SEGMENT_KEY = "bySegment";
public static final String BROKER_SERVICE_NAME = "brokerService";
public static final boolean DEFAULT_POPULATE_CACHE = true;
public static final String ENABLE_UNNESTED_ARRAYS_KEY = "enableUnnestedArrays";

public static final boolean DEFAULT_BY_SEGMENT = false;
public static final boolean DEFAULT_POPULATE_CACHE = true;
public static final boolean DEFAULT_USE_CACHE = true;
public static final boolean DEFAULT_POPULATE_RESULTLEVEL_CACHE = true;
public static final boolean DEFAULT_USE_RESULTLEVEL_CACHE = true;
Expand All @@ -90,6 +91,7 @@ public class QueryContexts
public static final boolean DEFAULT_USE_FILTER_CNF = false;
public static final boolean DEFAULT_SECONDARY_PARTITION_PRUNING = true;
public static final boolean DEFAULT_ENABLE_DEBUG = false;
public static final boolean DEFAULT_ENABLE_UNNESTED_ARRAYS = true;

@SuppressWarnings("unused") // Used by Jackson serialization
public enum Vectorize
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.apache.druid.java.util.common.IAE;
import org.apache.druid.query.dimension.ColumnSelectorStrategyFactory;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.segment.ColumnValueSelector;
import org.apache.druid.segment.column.ColumnCapabilities;

Expand All @@ -30,7 +31,8 @@ public class CardinalityAggregatorColumnSelectorStrategyFactory
@Override
public CardinalityAggregatorColumnSelectorStrategy makeColumnSelectorStrategy(
ColumnCapabilities capabilities,
ColumnValueSelector selector
ColumnValueSelector selector,
DimensionSpec dimesionSpec
)
{
switch (capabilities.getType()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,9 @@

public interface ColumnSelectorStrategyFactory<ColumnSelectorStrategyClass extends ColumnSelectorStrategy>
{
ColumnSelectorStrategyClass makeColumnSelectorStrategy(ColumnCapabilities capabilities, ColumnValueSelector selector);
ColumnSelectorStrategyClass makeColumnSelectorStrategy(
ColumnCapabilities capabilities,
ColumnValueSelector selector,
DimensionSpec dimensionSpec
cryptoe marked this conversation as resolved.
Show resolved Hide resolved
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,7 @@
package org.apache.druid.query.groupby;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializerProvider;
Expand All @@ -34,7 +31,6 @@
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import org.apache.druid.data.input.Row;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.MappedSequence;
Expand Down Expand Up @@ -433,20 +429,7 @@ public void serialize(
}
};

cryptoe marked this conversation as resolved.
Show resolved Hide resolved
// Deserializer that can deserialize either array- or map-based rows.
final JsonDeserializer<ResultRow> deserializer = new JsonDeserializer<ResultRow>()
{
@Override
public ResultRow deserialize(final JsonParser jp, final DeserializationContext ctxt) throws IOException
{
if (jp.isExpectedStartObjectToken()) {
final Row row = jp.readValueAs(Row.class);
return ResultRow.fromLegacyRow(row, query);
} else {
return ResultRow.of(jp.readValueAs(Object[].class));
}
}
};
final ResultRowDeserializer deserializer = ResultRowDeserializer.fromQuery(query);

class GroupByResultRowModule extends SimpleModule
{
Expand Down Expand Up @@ -700,7 +683,6 @@ public Sequence<Object[]> resultsAsArrays(final GroupByQuery query, final Sequen
* as the final step of the query instead of on every event.
*
* @param query The query to check for optimizations
*
* @return The set of dimensions (as offsets into {@code query.getDimensions()}) which can be extracted at the last
* second upon query completion.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.query.groupby;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import org.apache.druid.data.input.Row;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.segment.data.ComparableStringArray;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


public class ResultRowDeserializer extends JsonDeserializer<ResultRow>
{
final List<ColumnType> types;
final GroupByQuery query;

public ResultRowDeserializer(final List<ColumnType> types, final GroupByQuery query)
{
this.types = types;
this.query = query;
}

public static ResultRowDeserializer fromQuery(
final GroupByQuery query
)
{
RowSignature rowSignature = query.getResultRowSignature();
final List<ColumnType> types = new ArrayList<>(rowSignature.size());

for (String name : rowSignature.getColumnNames()) {
final ColumnType type = rowSignature.getColumnType(name)
.orElseThrow(() -> new ISE("No type for column [%s]", name));

types.add(type);
}

return new ResultRowDeserializer(types, query);

}

@Override
public ResultRow deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException
{
// Deserializer that can deserialize either array- or map-based rows.
if (jp.isExpectedStartObjectToken()) {
final Row row = jp.readValueAs(Row.class);
return ResultRow.fromLegacyRow(row, query);
} else if (jp.isExpectedStartArrayToken()) {
final Object[] retVal = new Object[types.size()];

for (int i = 0; i < types.size(); i++) {
final JsonToken token = jp.nextToken();
switch (types.get(i).getType()) {
case STRING:
if (token == JsonToken.VALUE_NULL) {
retVal[i] = null;
} else if (token == JsonToken.VALUE_STRING) {
retVal[i] = jp.getText();
} else {
throw ctxt.instantiationException(
ResultRow.class,
StringUtils.format("Unexpected token [%s] when reading string", token)
);
}
break;

case LONG:
retVal[i] = token == JsonToken.VALUE_NULL ? null : jp.getLongValue();
break;
case DOUBLE:
retVal[i] = token == JsonToken.VALUE_NULL ? null : jp.getDoubleValue();
break;
case FLOAT:
retVal[i] = token == JsonToken.VALUE_NULL ? null : jp.getFloatValue();
break;
case ARRAY:
if (types.get(i).equals(ColumnType.STRING_ARRAY)) {
final List<String> strings = new ArrayList<>();
while (jp.nextToken() != JsonToken.END_ARRAY) {
strings.add(jp.getText());
}
retVal[i] = ComparableStringArray.of(strings.toArray(new String[0]));
break;
}

default:
throw new ISE("Can't handle type [%s]", types.get(i).asTypeString());
cryptoe marked this conversation as resolved.
Show resolved Hide resolved
}
}
if (jp.nextToken() != JsonToken.END_ARRAY) {
throw ctxt.wrongTokenException(jp, ResultRow.class, JsonToken.END_ARRAY, null);
}
return ResultRow.of(retVal);
} else {
return (ResultRow) ctxt.handleUnexpectedToken(ResultRow.class, jp);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.apache.druid.query.groupby.GroupByQuery;
import org.apache.druid.query.groupby.GroupByQueryConfig;
import org.apache.druid.query.groupby.ResultRow;
import org.apache.druid.query.groupby.epinephelinae.column.ArrayGroupByColumnSelectorStrategy;
import org.apache.druid.query.groupby.epinephelinae.column.DictionaryBuildingStringGroupByColumnSelectorStrategy;
import org.apache.druid.query.groupby.epinephelinae.column.DoubleGroupByColumnSelectorStrategy;
import org.apache.druid.query.groupby.epinephelinae.column.FloatGroupByColumnSelectorStrategy;
Expand Down Expand Up @@ -290,6 +291,11 @@ public static int getCardinalityForArrayAggregation(
if (query.getVirtualColumns().exists(Iterables.getOnlyElement(dimensions).getDimension())) {
return -1;
}
// We cannot support array-based aggregation on array based grouping as we we donot have all the indexes up front
// to allocate appropriate values
if (dimensions.get(0).getOutputType().equals(ColumnType.STRING_ARRAY)) {
return -1;
}

final String columnName = Iterables.getOnlyElement(dimensions).getDimension();
columnCapabilities = storageAdapter.getColumnCapabilities(columnName);
Expand Down Expand Up @@ -340,7 +346,8 @@ public static boolean isAllSingleValueDims(

// Now check column capabilities, which must be present and explicitly not multi-valued
final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(dimension.getDimension());
return columnCapabilities != null && columnCapabilities.hasMultipleValues().isFalse();
return dimension.getOutputType().equals(ColumnType.STRING_ARRAY)
cryptoe marked this conversation as resolved.
Show resolved Hide resolved
|| (columnCapabilities != null && columnCapabilities.hasMultipleValues().isFalse());
});
}

Expand Down Expand Up @@ -386,13 +393,16 @@ private static class GroupByStrategyFactory
@Override
public GroupByColumnSelectorStrategy makeColumnSelectorStrategy(
ColumnCapabilities capabilities,
ColumnValueSelector selector
ColumnValueSelector selector,
DimensionSpec dimensionSpec
)
{
switch (capabilities.getType()) {
case STRING:
cryptoe marked this conversation as resolved.
Show resolved Hide resolved
DimensionSelector dimSelector = (DimensionSelector) selector;
if (dimSelector.getValueCardinality() >= 0) {
if (dimensionSpec.getOutputType().equals(ColumnType.STRING_ARRAY)) {
return new ArrayGroupByColumnSelectorStrategy();
} else if (dimSelector.getValueCardinality() >= 0) {
return new StringGroupByColumnSelectorStrategy(dimSelector::lookupName, capabilities);
} else {
return new DictionaryBuildingStringGroupByColumnSelectorStrategy();
Expand Down
Loading