Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorized aggregation with grouping by one fixed-size column #7341

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
b92e622
Vectorized hash grouping on one column
akuzm Oct 2, 2024
4ce0e99
Merge remote-tracking branch 'origin/main' into HEAD
akuzm Oct 2, 2024
74d4419
benchmark vectorized grouping (2024-10-02 no. 6)
akuzm Oct 2, 2024
baedf7f
fixes
akuzm Oct 2, 2024
35dbd36
benchmark vectorized grouping (2024-10-02 no. 7)
akuzm Oct 2, 2024
74fffd3
some ugly stuff
akuzm Oct 2, 2024
f8db454
benchmark vectorized grouping (2024-10-02 no. 9)
akuzm Oct 2, 2024
00a9d11
something
akuzm Oct 4, 2024
339f91a
reduce indirections
akuzm Oct 4, 2024
f075589
skip null bitmap words
akuzm Oct 8, 2024
88f325d
cleanup
akuzm Oct 9, 2024
15ab443
crc32
akuzm Oct 9, 2024
ff16ec8
license
akuzm Oct 9, 2024
4291b17
benchmark vectorized hash grouping (2024-10-09 no. 10)
akuzm Oct 9, 2024
795ef6b
test deltadelta changes
akuzm Oct 11, 2024
1fabb22
some speedups and simplehash simplifications
akuzm Oct 11, 2024
717abc4
Revert "test deltadelta changes"
akuzm Oct 11, 2024
b03bd6b
test deltadelta changes
akuzm Oct 11, 2024
166d0e8
work with signed types
akuzm Oct 14, 2024
7f578b4
Revert "work with signed types"
akuzm Oct 14, 2024
e70cb0b
bulk stuff specialized to element type
akuzm Oct 14, 2024
0040844
roll back the delta delta stuff
akuzm Oct 14, 2024
694faf6
use simplehash
akuzm Oct 14, 2024
3d05674
cleanup
akuzm Oct 14, 2024
d90a90f
benchmark vectorized hash grouping (simple) (2024-10-14 no. 11)
akuzm Oct 14, 2024
4a93549
add more tests
akuzm Oct 15, 2024
3e06b92
remove modified simplehash
akuzm Oct 15, 2024
a7942ed
offsets
akuzm Oct 15, 2024
6fb517f
cleanup
akuzm Oct 15, 2024
ffb28cf
changelog
akuzm Oct 15, 2024
778ca97
cleanup
akuzm Oct 15, 2024
ef3847a
benchmark vectorized hash grouping (simple) (2024-10-15 no. 12)
akuzm Oct 15, 2024
1409c74
32-bit
akuzm Oct 15, 2024
514ae96
some renames
akuzm Oct 15, 2024
22d23b3
cleanup
akuzm Oct 15, 2024
cd7a1dc
spelling
akuzm Oct 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/linux-32bit-build-and-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ jobs:
CC: clang-14
CXX: clang++-14
DEBIAN_FRONTEND: noninteractive
IGNORES: "append-* transparent_decompression-* transparent_decompress_chunk-* pg_dump telemetry bgw_db_scheduler*"
# vectorized_aggregation has different output on i386 because int8 is by
# reference and currently it cannot be used for vectorized hash grouping.
IGNORES: "append-* transparent_decompression-* transparent_decompress_chunk-* pg_dump telemetry bgw_db_scheduler* vectorized_aggregation"
SKIPS: chunk_adaptive histogram_test-*
EXTENSIONS: "postgres_fdw test_decoding"
strategy:
Expand Down
1 change: 1 addition & 0 deletions .unreleased/vectorized-grouping-one-fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implements: #7341 Vectorized aggregation with grouping by one fixed-size by-value compressed column (such as arithmetic types).
12 changes: 6 additions & 6 deletions tsl/src/compression/algorithms/deltadelta_impl.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,20 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
* Pad the number of elements to multiple of 64 bytes if needed, so that we
* can work in 64-byte blocks.
*/
#define INNER_LOOP_SIZE_LOG2 3
#define INNER_LOOP_SIZE (1 << INNER_LOOP_SIZE_LOG2)
const uint32 n_total = has_nulls ? nulls.num_elements : num_deltas;
const uint32 n_total_padded =
((n_total * sizeof(ELEMENT_TYPE) + 63) / 64) * 64 / sizeof(ELEMENT_TYPE);
const uint32 n_total_padded = pad_to_multiple(INNER_LOOP_SIZE, n_total);
const uint32 n_notnull = num_deltas;
const uint32 n_notnull_padded =
((n_notnull * sizeof(ELEMENT_TYPE) + 63) / 64) * 64 / sizeof(ELEMENT_TYPE);
const uint32 n_notnull_padded = pad_to_multiple(INNER_LOOP_SIZE, n_notnull);
Assert(n_total_padded >= n_total);
Assert(n_notnull_padded >= n_notnull);
Assert(n_total >= n_notnull);
Assert(n_total <= GLOBAL_MAX_ROWS_PER_COMPRESSION);

/*
* We need additional padding at the end of buffer, because the code that
* converts the elements to postres Datum always reads in 8 bytes.
* converts the elements to postgres Datum always reads in 8 bytes.
*/
const int buffer_bytes = n_total_padded * sizeof(ELEMENT_TYPE) + 8;
ELEMENT_TYPE *restrict decompressed_values = MemoryContextAlloc(dest_mctx, buffer_bytes);
Expand All @@ -75,7 +75,6 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
* Also tried zig-zag decoding in a separate loop, seems to be slightly
* slower, around the noise threshold.
*/
#define INNER_LOOP_SIZE 8
Assert(n_notnull_padded % INNER_LOOP_SIZE == 0);
for (uint32 outer = 0; outer < n_notnull_padded; outer += INNER_LOOP_SIZE)
{
Expand All @@ -86,6 +85,7 @@ FUNCTION_NAME(delta_delta_decompress_all, ELEMENT_TYPE)(Datum compressed, Memory
decompressed_values[outer + inner] = current_element;
}
}
#undef INNER_LOOP_SIZE_LOG2
#undef INNER_LOOP_SIZE

uint64 *restrict validity_bitmap = NULL;
Expand Down
2 changes: 1 addition & 1 deletion tsl/src/compression/arrow_c_data_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ pad_to_multiple(uint64 pad_to, uint64 source_value)
}

static inline size_t
arrow_num_valid(uint64 *bitmap, size_t total_rows)
arrow_num_valid(const uint64 *bitmap, size_t total_rows)
{
if (bitmap == NULL)
{
Expand Down
1 change: 1 addition & 0 deletions tsl/src/nodes/vector_agg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ add_subdirectory(function)
set(SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/exec.c
${CMAKE_CURRENT_SOURCE_DIR}/grouping_policy_batch.c
${CMAKE_CURRENT_SOURCE_DIR}/grouping_policy_hash.c
${CMAKE_CURRENT_SOURCE_DIR}/plan.c)
target_sources(${TSL_LIBRARY_NAME} PRIVATE ${SOURCES})
34 changes: 28 additions & 6 deletions tsl/src/nodes/vector_agg/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ vector_agg_begin(CustomScanState *node, EState *estate, int eflags)
Aggref *aggref = castNode(Aggref, tlentry->expr);
VectorAggFunctions *func = get_vector_aggregate(aggref->aggfnoid);
Assert(func != NULL);
def->func = func;
def->func = *func;

if (list_length(aggref->args) > 0)
{
Expand Down Expand Up @@ -122,11 +122,33 @@ vector_agg_begin(CustomScanState *node, EState *estate, int eflags)
}
}

List *grouping_column_offsets = linitial(cscan->custom_private);
vector_agg_state->grouping =
create_grouping_policy_batch(vector_agg_state->agg_defs,
vector_agg_state->output_grouping_columns,
/* partial_per_batch = */ grouping_column_offsets != NIL);
if (list_length(vector_agg_state->output_grouping_columns) == 1)
{
GroupingColumn *col =
(GroupingColumn *) linitial(vector_agg_state->output_grouping_columns);
DecompressContext *dcontext = &decompress_state->decompress_context;
CompressionColumnDescription *desc = &dcontext->compressed_chunk_columns[col->input_offset];
if (desc->type == COMPRESSED_COLUMN && desc->by_value && desc->value_bytes > 0 &&
(size_t) desc->value_bytes <= sizeof(Datum))
{
/*
* Hash grouping by a single fixed-size by-value compressed column.
*/
vector_agg_state->grouping =
create_grouping_policy_hash(vector_agg_state->agg_defs,
vector_agg_state->output_grouping_columns);
}
}

if (vector_agg_state->grouping == NULL)
{
/*
* Per-batch grouping.
*/
vector_agg_state->grouping =
create_grouping_policy_batch(vector_agg_state->agg_defs,
vector_agg_state->output_grouping_columns);
}
}

static void
Expand Down
2 changes: 1 addition & 1 deletion tsl/src/nodes/vector_agg/exec.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

typedef struct
{
VectorAggFunctions *func;
VectorAggFunctions func;
int input_offset;
int output_offset;
} VectorAggDef;
Expand Down
33 changes: 33 additions & 0 deletions tsl/src/nodes/vector_agg/function/agg_many_vector_helper.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* This file and its contents are licensed under the Timescale License.
* Please see the included NOTICE for copyright information and
* LICENSE-TIMESCALE for a copy of the license.
*/

/*
* A generic implementation of adding the given batch to many aggregate function
* states with given offsets. Used for hash aggregation, and builds on the
* FUNCTION_NAME(one) function, which adds one passing non-null row to the given
* aggregate function state.
*/
/*
 * Add the rows [start_row, end_row) of the arrow vector to the aggregate
 * states addressed by the per-row offsets. Offset 0 marks a row that did
 * not pass the filter. Builds on the FUNCTION_NAME(one) single-row
 * transition function.
 */
static void
FUNCTION_NAME(many_vector)(void *restrict agg_states, uint32 *restrict offsets, int start_row,
			   int end_row, const ArrowArray *vector, MemoryContext agg_extra_mctx)
{
	/*
	 * Any allocations the transition function makes must go to the
	 * aggregate's extra memory context.
	 */
	MemoryContext old_context = MemoryContextSwitchTo(agg_extra_mctx);

	FUNCTION_NAME(state) *states = (FUNCTION_NAME(state) *) agg_states;
	const uint64 *validity = vector->buffers[0];
	const CTYPE *data = vector->buffers[1];

	for (int i = start_row; i < end_row; i++)
	{
		/* Load unconditionally; branch only on the combined predicate. */
		const CTYPE v = data[i];
		const bool passes = (offsets[i] != 0);
		const bool notnull = arrow_row_is_valid(validity, i);

		if (passes && notnull)
		{
			FUNCTION_NAME(one)(&states[offsets[i]], v);
		}
	}

	MemoryContextSwitchTo(old_context);
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,20 @@
* implementation otherwise.
*/
static void
FUNCTION_NAME(const)(void *agg_state, Datum constvalue, bool constisnull, int n,
MemoryContext agg_extra_mctx)
FUNCTION_NAME(scalar)(void *agg_state, Datum constvalue, bool constisnull, int n,
MemoryContext agg_extra_mctx)
{
const uint64 valid = constisnull ? 0 : 1;
const CTYPE value = valid ? DATUM_TO_CTYPE(constvalue) : 0;
if (constisnull)
{
return;
}

const CTYPE value = DATUM_TO_CTYPE(constvalue);

MemoryContext old = MemoryContextSwitchTo(agg_extra_mctx);
for (int i = 0; i < n; i++)
{
FUNCTION_NAME(vector_impl)(agg_state, 1, &value, &valid, NULL, agg_extra_mctx);
FUNCTION_NAME(one)(agg_state, value);
}
MemoryContextSwitchTo(old);
}
51 changes: 42 additions & 9 deletions tsl/src/nodes/vector_agg/function/float48_accum_single.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,13 @@ typedef struct
} FUNCTION_NAME(state);

static void
FUNCTION_NAME(init)(void *agg_state)
FUNCTION_NAME(init)(void *restrict agg_states, int n)
{
FUNCTION_NAME(state) *state = (FUNCTION_NAME(state) *) agg_state;
*state = (FUNCTION_NAME(state)){ 0 };
FUNCTION_NAME(state) *states = (FUNCTION_NAME(state) *) agg_states;
for (int i = 0; i < n; i++)
{
states[i] = (FUNCTION_NAME(state)){ 0 };
}
}

static void
Expand Down Expand Up @@ -290,14 +293,44 @@ FUNCTION_NAME(vector_impl)(void *agg_state, size_t n, const CTYPE *values, const
COMBINE(&state->N, &state->Sx, &state->Sxx, Narray[0], Sxarray[0], Sxxarray[0]);
}

#include "agg_const_helper.c"
static pg_attribute_always_inline void
FUNCTION_NAME(one)(void *restrict agg_state, const CTYPE value)
{
FUNCTION_NAME(state) *state = (FUNCTION_NAME(state) *) agg_state;
/*
* This code follows the Postgres float8_accum() transition function, see
* the comments there.
*/
const double newN = state->N + 1.0;
const double newSx = state->Sx + value;
#ifdef NEED_SXX
if (state->N > 0.0)
{
const double tmp = value * newN - newSx;
state->Sxx += tmp * tmp / (state->N * newN);
}
else
{
state->Sxx = 0 * value;
}
#endif

state->N = newN;
state->Sx = newSx;
}

#include "agg_many_vector_helper.c"
#include "agg_scalar_helper.c"
#include "agg_vector_validity_helper.c"

VectorAggFunctions FUNCTION_NAME(argdef) = { .state_bytes = sizeof(FUNCTION_NAME(state)),
.agg_init = FUNCTION_NAME(init),
.agg_emit = FUNCTION_NAME(emit),
.agg_const = FUNCTION_NAME(const),
.agg_vector = FUNCTION_NAME(vector) };
VectorAggFunctions FUNCTION_NAME(argdef) = {
.state_bytes = sizeof(FUNCTION_NAME(state)),
.agg_init = FUNCTION_NAME(init),
.agg_emit = FUNCTION_NAME(emit),
.agg_scalar = FUNCTION_NAME(scalar),
.agg_vector = FUNCTION_NAME(vector),
.agg_many_vector = FUNCTION_NAME(many_vector),
};
#undef UPDATE
#undef COMBINE

Expand Down
63 changes: 51 additions & 12 deletions tsl/src/nodes/vector_agg/function/functions.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@ typedef struct
} CountState;

static void
count_init(void *agg_state)
count_init(void *restrict agg_states, int n)
{
CountState *state = (CountState *) agg_state;
state->count = 0;
CountState *states = (CountState *) agg_states;
for (int i = 0; i < n; i++)
{
states[i].count = 0;
}
}

static void
Expand All @@ -42,26 +45,44 @@ count_emit(void *agg_state, Datum *out_result, bool *out_isnull)
}

static void
count_star_const(void *agg_state, Datum constvalue, bool constisnull, int n,
MemoryContext agg_extra_mctx)
count_star_scalar(void *agg_state, Datum constvalue, bool constisnull, int n,
MemoryContext agg_extra_mctx)
{
CountState *state = (CountState *) agg_state;
state->count += n;
}

/*
 * count(*) over many aggregate states: the scalar value and its nullness
 * are irrelevant, only the filter encoded in the offsets matters
 * (offset 0 means the row did not pass).
 */
static void
count_star_many_scalar(void *restrict agg_states, uint32 *restrict offsets, int start_row,
		       int end_row, Datum constvalue, bool constisnull,
		       MemoryContext agg_extra_mctx)
{
	CountState *counts = (CountState *) agg_states;
	for (int i = start_row; i < end_row; i++)
	{
		const uint32 off = offsets[i];
		if (off != 0)
		{
			counts[off].count++;
		}
	}
}

VectorAggFunctions count_star_agg = {
.state_bytes = sizeof(CountState),
.agg_init = count_init,
.agg_const = count_star_const,
.agg_scalar = count_star_scalar,
.agg_emit = count_emit,
.agg_many_scalar = count_star_many_scalar,
};

/*
* Aggregate function count(x).
*/
static void
count_any_const(void *agg_state, Datum constvalue, bool constisnull, int n,
MemoryContext agg_extra_mctx)
count_any_scalar(void *agg_state, Datum constvalue, bool constisnull, int n,
MemoryContext agg_extra_mctx)
{
if (constisnull)
{
Expand All @@ -73,8 +94,8 @@ count_any_const(void *agg_state, Datum constvalue, bool constisnull, int n,
}

static void
count_any_vector(void *agg_state, const ArrowArray *vector, const uint64 *filter,
MemoryContext agg_extra_mctx)
count_any_many_vector(void *agg_state, const ArrowArray *vector, const uint64 *filter,
MemoryContext agg_extra_mctx)
{
CountState *state = (CountState *) agg_state;
const int n = vector->length;
Expand Down Expand Up @@ -110,12 +131,30 @@ count_any_vector(void *agg_state, const ArrowArray *vector, const uint64 *filter
}
}

/*
 * count(x) over many aggregate states: a row is counted when it both
 * passes the filter (offset != 0) and has a non-null value according to
 * the Arrow validity bitmap.
 */
static void
count_any_many(void *restrict agg_states, uint32 *restrict offsets, int start_row, int end_row,
	       const ArrowArray *vector, MemoryContext agg_extra_mctx)
{
	CountState *states = (CountState *) agg_states;
	const uint64 *validity = vector->buffers[0];
	for (int i = start_row; i < end_row; i++)
	{
		const bool passes = (offsets[i] != 0);
		const bool notnull = arrow_row_is_valid(validity, i);
		if (passes && notnull)
		{
			states[offsets[i]].count++;
		}
	}
}

VectorAggFunctions count_any_agg = {
.state_bytes = sizeof(CountState),
.agg_init = count_init,
.agg_emit = count_emit,
.agg_const = count_any_const,
.agg_vector = count_any_vector,
.agg_scalar = count_any_scalar,
.agg_vector = count_any_many_vector,
.agg_many_vector = count_any_many,
};

/*
Expand Down
Loading
Loading