ability to not rollup at index time, make pre aggregation an option (#3020)

* ability to not rollup at index time, make pre aggregation an option

* rename getRowIndexForRollup to getPriorIndex

* fix doc misspelling

* test query using no-rollup indexes

* fix benchmark fail due to jmh bug
kaijianding authored and fjy committed Aug 2, 2016
1 parent 0bdaaa2 commit 50d52a2
Showing 50 changed files with 1,247 additions and 154 deletions.
@@ -82,6 +82,9 @@ public class IncrementalIndexReadBenchmark
@Param({"basic"})
private String schema;

@Param({"true", "false"})
private boolean rollup;

private static final Logger log = new Logger(IncrementalIndexReadBenchmark.class);
private static final int RNG_SEED = 9999;
private IncrementalIndex incIndex;
@@ -125,6 +128,7 @@ private IncrementalIndex makeIncIndex()
.withQueryGranularity(QueryGranularities.NONE)
.withMetrics(schemaInfo.getAggsArray())
.withDimensionsSpec(new DimensionsSpec(null, null, null))
.withRollup(rollup)
.build(),
true,
false,
@@ -63,6 +63,9 @@ public class IndexIngestionBenchmark
@Param({"basic"})
private String schema;

@Param({"true", "false"})
private boolean rollup;

private static final Logger log = new Logger(IndexIngestionBenchmark.class);
private static final int RNG_SEED = 9999;

@@ -107,11 +110,12 @@ private IncrementalIndex makeIncIndex()
.withQueryGranularity(QueryGranularities.NONE)
.withMetrics(schemaInfo.getAggsArray())
.withDimensionsSpec(new DimensionsSpec(null, null, null))
.withRollup(rollup)
.build(),
true,
false,
true,
rowsPerSegment
rowsPerSegment * 2
);
}

@@ -75,6 +75,9 @@ public class IndexMergeBenchmark
@Param({"basic"})
private String schema;

@Param({"true", "false"})
private boolean rollup;

private static final Logger log = new Logger(IndexMergeBenchmark.class);
private static final int RNG_SEED = 9999;
private static final IndexMerger INDEX_MERGER;
@@ -155,6 +158,7 @@ private IncrementalIndex makeIncIndex()
.withQueryGranularity(QueryGranularities.NONE)
.withMetrics(schemaInfo.getAggsArray())
.withDimensionsSpec(new DimensionsSpec(null, null, null))
.withRollup(rollup)
.build(),
true,
false,
@@ -174,7 +178,7 @@ public void merge(Blackhole blackhole) throws Exception
log.info(tmpFile.getAbsolutePath() + " isFile: " + tmpFile.isFile() + " isDir:" + tmpFile.isDirectory());
tmpFile.deleteOnExit();

File mergedFile = INDEX_MERGER.mergeQueryableIndex(indexesToMerge, schemaInfo.getAggsArray(), tmpFile, new IndexSpec());
File mergedFile = INDEX_MERGER.mergeQueryableIndex(indexesToMerge, rollup, schemaInfo.getAggsArray(), tmpFile, new IndexSpec());

blackhole.consume(mergedFile);

@@ -192,7 +196,7 @@ public void mergeV9(Blackhole blackhole) throws Exception
log.info(tmpFile.getAbsolutePath() + " isFile: " + tmpFile.isFile() + " isDir:" + tmpFile.isDirectory());
tmpFile.deleteOnExit();

File mergedFile = INDEX_MERGER_V9.mergeQueryableIndex(indexesToMerge, schemaInfo.getAggsArray(), tmpFile, new IndexSpec());
File mergedFile = INDEX_MERGER_V9.mergeQueryableIndex(indexesToMerge, rollup, schemaInfo.getAggsArray(), tmpFile, new IndexSpec());

blackhole.consume(mergedFile);

@@ -72,6 +72,9 @@ public class IndexPersistBenchmark
@Param({"basic"})
private String schema;

@Param({"true", "false"})
private boolean rollup;

private static final Logger log = new Logger(IndexPersistBenchmark.class);
private static final int RNG_SEED = 9999;

@@ -156,6 +159,7 @@ private IncrementalIndex makeIncIndex()
.withQueryGranularity(QueryGranularities.NONE)
.withMetrics(schemaInfo.getAggsArray())
.withDimensionsSpec(new DimensionsSpec(null, null, null))
.withRollup(rollup)
.build(),
true,
false,
2 changes: 2 additions & 0 deletions docs/content/ingestion/index.md
@@ -186,6 +186,7 @@ This spec is used to generate segments with uniform intervals.
| type | string | The type of granularity spec. | no (default == 'uniform') |
| segmentGranularity | string | The granularity to create segments at. | no (default == 'DAY') |
| queryGranularity | string | The minimum granularity to be able to query results at and the granularity of the data inside the segment. E.g. a value of "minute" will mean that data is aggregated at minutely granularity. That is, if there are collisions in the tuple (minute(timestamp), dimensions), then it will aggregate values together using the aggregators instead of storing individual rows. | no (default == 'NONE') |
| rollup | boolean | Whether to roll up (pre-aggregate) data at ingestion time. If false, individual rows are stored without rollup. | no (default == true) |
| intervals | string | A list of intervals for the raw data being ingested. Ignored for real-time ingestion. | yes for batch, no for real-time |
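
As an illustrative sketch (the interval value is hypothetical), a uniform granularity spec that disables ingestion-time rollup might look like this; the `rollup` field behaves the same way in the arbitrary spec below:

```json
{
  "type": "uniform",
  "segmentGranularity": "DAY",
  "queryGranularity": "NONE",
  "rollup": false,
  "intervals": ["2016-08-01/2016-08-02"]
}
```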

### Arbitrary Granularity Spec
@@ -196,6 +197,7 @@ This spec is used to generate segments with arbitrary intervals (it tries to create evenly sized segments).
|-------|------|-------------|----------|
| type | string | The type of granularity spec. | no (default == 'uniform') |
| queryGranularity | string | The minimum granularity to be able to query results at and the granularity of the data inside the segment. E.g. a value of "minute" will mean that data is aggregated at minutely granularity. That is, if there are collisions in the tuple (minute(timestamp), dimensions), then it will aggregate values together using the aggregators instead of storing individual rows. | no (default == 'NONE') |
| rollup | boolean | Whether to roll up (pre-aggregate) data at ingestion time. If false, individual rows are stored without rollup. | no (default == true) |
| intervals | string | A list of intervals for the raw data being ingested. Ignored for real-time ingestion. | yes for batch, no for real-time |

# IO Config
6 changes: 5 additions & 1 deletion docs/content/ingestion/tasks.md
@@ -159,14 +159,18 @@ Append tasks append a list of segments together into a single segment (one after the other).

### Merge Task

Merge tasks merge a list of segments together. Any common timestamps are merged. The grammar is:
Merge tasks merge a list of segments together. Any common timestamps are merged.
If rollup is disabled as part of ingestion, common timestamps are not merged and rows are reordered by their timestamp.

The grammar is:

```json
{
"type": "merge",
"id": <task_id>,
"dataSource": <task_datasource>,
"aggregations": <list of aggregators>,
"rollup": <whether or not to rollup data during a merge>,
"segments": <JSON list of DataSegment objects to merge>
}
```
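
For illustration only, a filled-in merge task that disables rollup might look like the sketch below (the id, dataSource, and aggregator are hypothetical, and the `segments` list is left empty here, where a real task would supply the DataSegment objects to merge):

```json
{
  "type": "merge",
  "id": "merge_sample_datasource_2016-08-01",
  "dataSource": "sample_datasource",
  "aggregations": [
    { "type": "doubleSum", "name": "value", "fieldName": "value" }
  ],
  "rollup": false,
  "segments": []
}
```
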
6 changes: 6 additions & 0 deletions docs/content/querying/segmentmetadataquery.md
@@ -11,6 +11,7 @@ Segment metadata queries return per-segment information about:
* Interval the segment covers
* Column type of all the columns in the segment
* Estimated total segment byte size as if it were stored in a flat format
* Whether the segment is rolled up
* Segment id

@@ -143,6 +144,11 @@ null if the aggregators are unknown or unmergeable (if merging is enabled).

* The form of the result is a map of column name to aggregator.

#### rollup

* `rollup` in the result is true/false/null.
* When merging is enabled and some segments are rolled up while others are not, the merged `rollup` is null.
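
For illustration (the dataSource and interval are hypothetical), a query requesting this analysis type might look like the following; each per-segment result then carries a `rollup` field:

```json
{
  "queryType": "segmentMetadata",
  "dataSource": "sample_datasource",
  "intervals": ["2016-08-01/2016-08-02"],
  "analysisTypes": ["rollup"]
}
```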

### lenientAggregatorMerge

Conflicts between aggregator metadata across segments can occur if some segments have unknown aggregators, or if they have conflicting aggregators (e.g. different aggregator types for the same column).
@@ -142,6 +142,7 @@ public boolean run()
new UniformGranularitySpec(
config.getGranularitySpec().getSegmentGranularity(),
config.getGranularitySpec().getQueryGranularity(),
config.getGranularitySpec().isRollup(),
intervals
)
);
@@ -226,6 +226,7 @@ private static IncrementalIndex makeIncrementalIndex(
.withDimensionsSpec(config.getSchema().getDataSchema().getParser())
.withQueryGranularity(config.getSchema().getDataSchema().getGranularitySpec().getQueryGranularity())
.withMetrics(aggs)
.withRollup(config.getSchema().getDataSchema().getGranularitySpec().isRollup())
.build();

OnheapIncrementalIndex newIndex = new OnheapIncrementalIndex(
@@ -514,13 +515,14 @@ protected File mergeQueryableIndex(
ProgressIndicator progressIndicator
) throws IOException
{
boolean rollup = config.getSchema().getDataSchema().getGranularitySpec().isRollup();
if (config.isBuildV9Directly()) {
return HadoopDruidIndexerConfig.INDEX_MERGER_V9.mergeQueryableIndex(
indexes, aggs, file, config.getIndexSpec(), progressIndicator
indexes, rollup, aggs, file, config.getIndexSpec(), progressIndicator
);
} else {
return HadoopDruidIndexerConfig.INDEX_MERGER.mergeQueryableIndex(
indexes, aggs, file, config.getIndexSpec(), progressIndicator
indexes, rollup, aggs, file, config.getIndexSpec(), progressIndicator
);
}
}
@@ -112,6 +112,7 @@ public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
new UniformGranularitySpec(
segmentGranularity,
config.getGranularitySpec().getQueryGranularity(),
config.getGranularitySpec().isRollup(),
Lists.newArrayList(bucketsToRun)
)
);
@@ -188,7 +188,7 @@ public void finishJob()
}

fileToUpload = new File(tmpSegmentDir, "merged");
theIndexMerger.mergeQueryableIndex(indexes, schema.getAggregators(), fileToUpload, config.getIndexSpec());
theIndexMerger.mergeQueryableIndex(indexes, schema.getGranularitySpec().isRollup(), schema.getAggregators(), fileToUpload, config.getIndexSpec());
}

// Map merged segment so we can extract dimensions
@@ -44,6 +44,7 @@ public class MergeTask extends MergeTaskBase
{
@JsonIgnore
private final List<AggregatorFactory> aggregators;
private final Boolean rollup;
private final IndexSpec indexSpec;

@JsonCreator
@@ -52,12 +53,14 @@ public MergeTask(
@JsonProperty("dataSource") String dataSource,
@JsonProperty("segments") List<DataSegment> segments,
@JsonProperty("aggregations") List<AggregatorFactory> aggregators,
@JsonProperty("rollup") Boolean rollup,
@JsonProperty("indexSpec") IndexSpec indexSpec,
@JsonProperty("context") Map<String, Object> context
)
{
super(id, dataSource, segments, context);
this.aggregators = Preconditions.checkNotNull(aggregators, "null aggregations");
this.rollup = rollup == null ? Boolean.TRUE : rollup;
this.indexSpec = indexSpec == null ? new IndexSpec() : indexSpec;
}

@@ -82,6 +85,7 @@ public QueryableIndex apply(@Nullable File input)
}
}
),
rollup,
aggregators.toArray(new AggregatorFactory[aggregators.size()]),
outDir,
indexSpec
@@ -176,6 +176,7 @@ public void testMergeTaskSerde() throws Exception
"foo",
segments,
aggregators,
true,
indexSpec,
null
);
@@ -355,6 +355,14 @@ public static SegmentAnalysis mergeAnalyses(
mergedId = "merged";
}

final Boolean rollup;

if (arg1.isRollup() != null && arg2.isRollup() != null && arg1.isRollup().equals(arg2.isRollup())) {
rollup = arg1.isRollup();
} else {
rollup = null;
}

return new SegmentAnalysis(
mergedId,
newIntervals,
@@ -363,7 +371,8 @@
arg1.getNumRows() + arg2.getNumRows(),
aggregators.isEmpty() ? null : aggregators,
timestampSpec,
queryGranularity
queryGranularity,
rollup
);
}

@@ -378,7 +387,8 @@ public static SegmentAnalysis finalizeAnalysis(SegmentAnalysis analysis)
analysis.getNumRows(),
analysis.getAggregators(),
analysis.getTimestampSpec(),
analysis.getQueryGranularity()
analysis.getQueryGranularity(),
analysis.isRollup()
);
}
}
@@ -148,6 +148,19 @@ public Sequence<SegmentAnalysis> run(Query<SegmentAnalysis> inQ, Map<String, Object> responseContext)
queryGranularity = null;
}

Boolean rollup = null;
if (query.hasRollup()) {
if (metadata == null) {
metadata = segment.asStorageAdapter().getMetadata();
}
rollup = metadata != null ? metadata.isRollup() : null;
if (rollup == null) {
// in this case, the segment was built before the no-rollup feature existed,
// so it was built with rollup enabled
rollup = Boolean.TRUE;
}
}

return Sequences.simple(
Arrays.asList(
new SegmentAnalysis(
Expand All @@ -158,7 +171,8 @@ public Sequence<SegmentAnalysis> run(Query<SegmentAnalysis> inQ, Map<String, Obj
numRows,
aggregators,
timestampSpec,
queryGranularity
queryGranularity,
rollup
)
)
);
@@ -40,6 +40,7 @@ public class SegmentAnalysis implements Comparable<SegmentAnalysis>
private final Map<String, AggregatorFactory> aggregators;
private final TimestampSpec timestampSpec;
private final QueryGranularity queryGranularity;
private final Boolean rollup;

@JsonCreator
public SegmentAnalysis(
@@ -50,7 +51,8 @@ public SegmentAnalysis(
@JsonProperty("numRows") long numRows,
@JsonProperty("aggregators") Map<String, AggregatorFactory> aggregators,
@JsonProperty("timestampSpec") TimestampSpec timestampSpec,
@JsonProperty("queryGranularity") QueryGranularity queryGranularity
@JsonProperty("queryGranularity") QueryGranularity queryGranularity,
@JsonProperty("rollup") Boolean rollup
)
{
this.id = id;
@@ -61,6 +63,7 @@ public SegmentAnalysis(
this.aggregators = aggregators;
this.timestampSpec = timestampSpec;
this.queryGranularity = queryGranularity;
this.rollup = rollup;
}

@JsonProperty
@@ -105,6 +108,12 @@ public QueryGranularity getQueryGranularity()
return queryGranularity;
}

@JsonProperty
public Boolean isRollup()
{
return rollup;
}

@JsonProperty
public Map<String, AggregatorFactory> getAggregators()
{
@@ -123,6 +132,7 @@ public String toString()
", aggregators=" + aggregators +
", timestampSpec=" + timestampSpec +
", queryGranularity=" + queryGranularity +
", rollup=" + rollup +
'}';
}

@@ -141,6 +151,7 @@ public boolean equals(Object o)
SegmentAnalysis that = (SegmentAnalysis) o;
return size == that.size &&
numRows == that.numRows &&
Objects.equals(rollup, that.rollup) &&
Objects.equals(id, that.id) &&
Objects.equals(interval, that.interval) &&
Objects.equals(columns, that.columns) &&
@@ -156,7 +167,7 @@
@Override
public int hashCode()
{
return Objects.hash(id, interval, columns, size, numRows, aggregators, timestampSpec, queryGranularity);
return Objects.hash(id, interval, columns, size, numRows, aggregators, timestampSpec, queryGranularity, rollup);
}

@Override
@@ -58,7 +58,8 @@ public enum AnalysisType
AGGREGATORS,
MINMAX,
TIMESTAMPSPEC,
QUERYGRANULARITY;
QUERYGRANULARITY,
ROLLUP;

@JsonValue
@Override
@@ -199,6 +200,11 @@ public boolean hasQueryGranularity()
return analysisTypes.contains(AnalysisType.QUERYGRANULARITY);
}

public boolean hasRollup()
{
return analysisTypes.contains(AnalysisType.ROLLUP);
}

public boolean hasMinMax()
{
return analysisTypes.contains(AnalysisType.MINMAX);