From cffae8220e89d691e8985348a3e0cb012c647e17 Mon Sep 17 00:00:00 2001 From: daidai <2017501503@qq.com> Date: Tue, 9 Jul 2024 15:15:54 +0800 Subject: [PATCH] =?UTF-8?q?[opt](serde)Optimize=20the=20filling=20of=20fix?= =?UTF-8?q?ed=20values=20into=20block=20columns=20withou?= =?UTF-8?q?t=20repeated=20deserialization.=20(#37377)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Proposed changes When a partitioned table is queried, the value of a partition column is the same for every row read from a given partition, so we can deserialize the value only once and then insert that value into the block repeatedly. ```sql in Hive: CREATE TABLE parquet_partition_tb ( col1 STRING, col2 INT, col3 DOUBLE ) PARTITIONED BY ( partition_col1 STRING, partition_col2 INT ) STORED AS PARQUET; insert into parquet_partition_tb partition (partition_col1="hello",partition_col2=1) values("word",2,2.3); insert into parquet_partition_tb partition(partition_col1="hello",partition_col2=1 ) select col1,col2,col3 from parquet_partition_tb where partition_col1="hello" and partition_col2=1; Repeat the `insert into xxx select xxx` operation several times. 
Doris: before: mysql> select count(partition_col1) from parquet_partition_tb; +-----------------------+ | count(partition_col1) | +-----------------------+ | 33554432 | +-----------------------+ 1 row in set (3.24 sec) mysql> select count(partition_col2) from parquet_partition_tb; +-----------------------+ | count(partition_col2) | +-----------------------+ | 33554432 | +-----------------------+ 1 row in set (3.34 sec) after: mysql> select count(partition_col1) from parquet_partition_tb ; +-----------------------+ | count(partition_col1) | +-----------------------+ | 33554432 | +-----------------------+ 1 row in set (0.79 sec) mysql> select count(partition_col2) from parquet_partition_tb; +-----------------------+ | count(partition_col2) | +-----------------------+ | 33554432 | +-----------------------+ 1 row in set (0.51 sec) ``` ## Summary: test SQL `select count(partition_col) from tbl;` Number of rows: 33554432 | partition column type | before (sec) | after (sec) | |---|---|--| |boolean | 3.96|0.47 | |tinyint | 3.39|0.47 | |smallint | 3.14|0.50 | |int |3.34|0.51 | |bigint | 3.61|0.51 | |float | 4.59 |0.51 | |double |4.60| 0.55 | |decimal(5,2)| 3.96 |0.61 | |date | 5.80|0.52 | |timestamp | 7.68 | 0.52 | |string | 3.24 |0.79 | Issue Number: close #xxx --- .../serde/data_type_datetimev2_serde.cpp | 21 +++++++++++++++ .../serde/data_type_datetimev2_serde.h | 5 ++++ .../serde/data_type_datev2_serde.cpp | 21 +++++++++++++++ .../data_types/serde/data_type_datev2_serde.h | 6 +++++ .../serde/data_type_decimal_serde.cpp | 26 +++++++++++++++++++ .../serde/data_type_decimal_serde.h | 6 +++++ .../serde/data_type_nullable_serde.cpp | 20 ++++++++++++++ .../serde/data_type_nullable_serde.h | 3 +++ .../serde/data_type_number_serde.cpp | 22 ++++++++++++++++ .../data_types/serde/data_type_number_serde.h | 6 +++++ be/src/vec/data_types/serde/data_type_serde.h | 21 +++++++++++++++ .../data_types/serde/data_type_string_serde.h | 25 ++++++++++++++++++ be/src/vec/exec/format/orc/vorc_reader.cpp | 9 +++---- 
.../format/parquet/vparquet_group_reader.cpp | 9 +++---- be/src/vec/exec/scan/vfile_scanner.cpp | 9 +++---- 15 files changed, 191 insertions(+), 18 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp index 63a199199a0d10..850ac5766fc119 100644 --- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp @@ -247,4 +247,25 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const std::string& timezone, return Status::OK(); } +Status DataTypeDateTimeV2SerDe::deserialize_column_from_fixed_json( + IColumn& column, Slice& slice, int rows, int* num_deserialized, + const FormatOptions& options) const { + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + return st; + } + + DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(column, rows - 1); + *num_deserialized = rows; + return Status::OK(); +} + +void DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(IColumn& column, + int times) const { + auto& col = static_cast&>(column); + auto sz = col.size(); + UInt64 val = col.get_element(sz - 1); + col.insert_many_vals(val, times); +} + } // namespace doris::vectorized diff --git a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h index 00b05f5fcd6230..ef4aa6843a068c 100644 --- a/be/src/vec/data_types/serde/data_type_datetimev2_serde.h +++ b/be/src/vec/data_types/serde/data_type_datetimev2_serde.h @@ -77,6 +77,11 @@ class DataTypeDateTimeV2SerDe : public DataTypeNumberSerDe { int start, int end, std::vector& buffer_list) const override; + Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const override; + void insert_column_last_value_multiple_times(IColumn& column, int times) const override; + 
private: template Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer& result, diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp index eb9122dd2408f3..f2d595b87c452f 100644 --- a/be/src/vec/data_types/serde/data_type_datev2_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_datev2_serde.cpp @@ -175,5 +175,26 @@ Status DataTypeDateV2SerDe::write_column_to_orc(const std::string& timezone, con return Status::OK(); } +Status DataTypeDateV2SerDe::deserialize_column_from_fixed_json(IColumn& column, Slice& slice, + int rows, int* num_deserialized, + const FormatOptions& options) const { + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + return st; + } + DataTypeDateV2SerDe::insert_column_last_value_multiple_times(column, rows - 1); + *num_deserialized = rows; + return Status::OK(); +} + +void DataTypeDateV2SerDe::insert_column_last_value_multiple_times(IColumn& column, + int times) const { + auto& col = static_cast&>(column); + auto sz = col.size(); + UInt32 val = col.get_element(sz - 1); + + col.insert_many_vals(val, times); +} + } // namespace vectorized } // namespace doris diff --git a/be/src/vec/data_types/serde/data_type_datev2_serde.h b/be/src/vec/data_types/serde/data_type_datev2_serde.h index 9a8b050eeba4a6..52e4cec364ebb6 100644 --- a/be/src/vec/data_types/serde/data_type_datev2_serde.h +++ b/be/src/vec/data_types/serde/data_type_datev2_serde.h @@ -74,6 +74,12 @@ class DataTypeDateV2SerDe : public DataTypeNumberSerDe { int start, int end, std::vector& buffer_list) const override; + Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const override; + + void insert_column_last_value_multiple_times(IColumn& column, int times) const override; + private: template Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer& result, diff --git 
a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp index a59fdedbfe6991..e979211d6d720b 100644 --- a/be/src/vec/data_types/serde/data_type_decimal_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_decimal_serde.cpp @@ -275,6 +275,32 @@ Status DataTypeDecimalSerDe::write_column_to_orc(const std::string& timezone, } return Status::OK(); } +template + +Status DataTypeDecimalSerDe::deserialize_column_from_fixed_json( + IColumn& column, Slice& slice, int rows, int* num_deserialized, + const FormatOptions& options) const { + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + return st; + } + + DataTypeDecimalSerDe::insert_column_last_value_multiple_times(column, rows - 1); + *num_deserialized = rows; + return Status::OK(); +} + +template +void DataTypeDecimalSerDe::insert_column_last_value_multiple_times(IColumn& column, + int times) const { + auto& col = static_cast&>(column); + auto sz = col.size(); + + T val = col.get_element(sz - 1); + for (int i = 0; i < times; i++) { + col.insert_value(val); + } +} template class DataTypeDecimalSerDe; template class DataTypeDecimalSerDe; diff --git a/be/src/vec/data_types/serde/data_type_decimal_serde.h b/be/src/vec/data_types/serde/data_type_decimal_serde.h index 55e68699f01b13..484c6686bc58f8 100644 --- a/be/src/vec/data_types/serde/data_type_decimal_serde.h +++ b/be/src/vec/data_types/serde/data_type_decimal_serde.h @@ -114,6 +114,12 @@ class DataTypeDecimalSerDe : public DataTypeSerDe { int start, int end, std::vector& buffer_list) const override; + Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const override; + + void insert_column_last_value_multiple_times(IColumn& column, int times) const override; + private: template Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer& result, diff --git 
a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp index faa3c8eb1f45f7..98ff1eb7f81b9b 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.cpp @@ -127,6 +127,26 @@ Status DataTypeNullableSerDe::deserialize_column_from_hive_text_vector( return Status::OK(); } +Status DataTypeNullableSerDe::deserialize_column_from_fixed_json( + IColumn& column, Slice& slice, int rows, int* num_deserialized, + const FormatOptions& options) const { + auto& col = static_cast(column); + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + return st; + } + auto& null_map = col.get_null_map_data(); + auto& nested_column = col.get_nested_column(); + + null_map.resize_fill( + rows, null_map.back()); // data_type_nullable::insert_column_last_value_multiple_times() + if (rows - 1 != 0) { + nested_serde->insert_column_last_value_multiple_times(nested_column, rows - 1); + } + *num_deserialized = rows; + return Status::OK(); +} + Status DataTypeNullableSerDe::deserialize_one_cell_from_json(IColumn& column, Slice& slice, const FormatOptions& options) const { auto& null_column = assert_cast(column); diff --git a/be/src/vec/data_types/serde/data_type_nullable_serde.h b/be/src/vec/data_types/serde/data_type_nullable_serde.h index 09d2fbde409acb..7b4841dcbdfd71 100644 --- a/be/src/vec/data_types/serde/data_type_nullable_serde.h +++ b/be/src/vec/data_types/serde/data_type_nullable_serde.h @@ -47,6 +47,9 @@ class DataTypeNullableSerDe : public DataTypeSerDe { int* num_deserialized, const FormatOptions& options) const override; + Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const override; Status deserialize_one_cell_from_hive_text( IColumn& column, Slice& slice, const FormatOptions& options, int hive_text_complex_type_delimiter_level 
= 1) const override; diff --git a/be/src/vec/data_types/serde/data_type_number_serde.cpp b/be/src/vec/data_types/serde/data_type_number_serde.cpp index 0ba338ce39909f..299779ea267961 100644 --- a/be/src/vec/data_types/serde/data_type_number_serde.cpp +++ b/be/src/vec/data_types/serde/data_type_number_serde.cpp @@ -224,6 +224,28 @@ void DataTypeNumberSerDe::read_column_from_arrow(IColumn& column, const auto* raw_data = reinterpret_cast(buffer->data()) + start; col_data.insert(raw_data, raw_data + row_count); } +template +Status DataTypeNumberSerDe::deserialize_column_from_fixed_json( + IColumn& column, Slice& slice, int rows, int* num_deserialized, + const FormatOptions& options) const { + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + return st; + } + + DataTypeNumberSerDe::insert_column_last_value_multiple_times(column, rows - 1); + *num_deserialized = rows; + return Status::OK(); +} + +template +void DataTypeNumberSerDe::insert_column_last_value_multiple_times(IColumn& column, + int times) const { + auto& col = static_cast&>(column); + auto sz = col.size(); + T val = col.get_element(sz - 1); + col.insert_many_vals(val, times); +} template template diff --git a/be/src/vec/data_types/serde/data_type_number_serde.h b/be/src/vec/data_types/serde/data_type_number_serde.h index c66bc994605115..18ba2fb26c79b5 100644 --- a/be/src/vec/data_types/serde/data_type_number_serde.h +++ b/be/src/vec/data_types/serde/data_type_number_serde.h @@ -70,6 +70,12 @@ class DataTypeNumberSerDe : public DataTypeSerDe { int* num_deserialized, const FormatOptions& options) const override; + Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const override; + + void insert_column_last_value_multiple_times(IColumn& column, int times) const override; + Status write_column_to_pb(const IColumn& column, PValues& result, int start, int end) const override; Status 
read_column_from_pb(IColumn& column, const PValues& arg) const override; diff --git a/be/src/vec/data_types/serde/data_type_serde.h b/be/src/vec/data_types/serde/data_type_serde.h index 77663e1d43a2a8..1f6e24aef3ffff 100644 --- a/be/src/vec/data_types/serde/data_type_serde.h +++ b/be/src/vec/data_types/serde/data_type_serde.h @@ -234,6 +234,27 @@ class DataTypeSerDe { virtual Status deserialize_column_from_json_vector(IColumn& column, std::vector& slices, int* num_deserialized, const FormatOptions& options) const = 0; + // deserialize fixed values.Repeatedly insert the value row times into the column. + virtual Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const { + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + *num_deserialized = 0; + return st; + } + insert_column_last_value_multiple_times(column, rows - 1); + *num_deserialized = rows; + return Status::OK(); + } + // Insert the last value to the end of this column multiple times. + virtual void insert_column_last_value_multiple_times(IColumn& column, int times) const { + //If you try to simplify this operation by using `column.insert_many_from(column, column.size() - 1, rows - 1);` + // you are likely to get incorrect data results. 
+ MutableColumnPtr dum_col = column.clone_empty(); + dum_col->insert_from(column, column.size() - 1); + column.insert_many_from(*dum_col.get(), 0, times); + } virtual Status deserialize_one_cell_from_hive_text( IColumn& column, Slice& slice, const FormatOptions& options, diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index b74b585708623f..0f0f1d0dfe88ea 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -132,6 +132,31 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { } return Status::OK(); } + + Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows, + int* num_deserialized, + const FormatOptions& options) const override { + Status st = deserialize_one_cell_from_json(column, slice, options); + if (!st.ok()) { + return st; + } + + DataTypeStringSerDeBase::insert_column_last_value_multiple_times(column, rows - 1); + *num_deserialized = rows; + return Status::OK(); + } + + void insert_column_last_value_multiple_times(IColumn& column, int times) const override { + auto& col = static_cast(column); + auto sz = col.size(); + + StringRef ref = col.get_data_at(sz - 1); + String str(ref.data, ref.size); + std::vector refs(times, {str.data(), str.size()}); + + col.insert_many_strings(refs.data(), refs.size()); + } + Status read_column_from_pb(IColumn& column, const PValues& arg) const override { auto& column_dest = assert_cast(column); column_dest.reserve(column_dest.size() + arg.string_value_size()); diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 16909f0023ae11..54d94dcecc7194 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -935,13 +935,10 @@ Status OrcReader::_fill_partition_columns( auto& [value, slot_desc] = kv.second; auto _text_serde = 
slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); - vector slices(rows); - for (int i = 0; i < rows; i++) { - slices[i] = {value.data(), value.size()}; - } int num_deserialized = 0; - if (_text_serde->deserialize_column_from_json_vector(*col_ptr, slices, &num_deserialized, - _text_formatOptions) != Status::OK()) { + if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, + &num_deserialized, + _text_formatOptions) != Status::OK()) { return Status::InternalError("Failed to fill partition column: {}={}", slot_desc->col_name(), value); } diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 5e824f34817c1f..9ec1235be1d22b 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -631,13 +631,10 @@ Status RowGroupReader::_fill_partition_columns( auto& [value, slot_desc] = kv.second; auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); - vector slices(rows); - for (int i = 0; i < rows; i++) { - slices[i] = {value.data(), value.size()}; - } int num_deserialized = 0; - if (_text_serde->deserialize_column_from_json_vector(*col_ptr, slices, &num_deserialized, - _text_formatOptions) != Status::OK()) { + if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, + &num_deserialized, + _text_formatOptions) != Status::OK()) { return Status::InternalError("Failed to fill partition column: {}={}", slot_desc->col_name(), value); } diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index dcfb404ae5a5b3..1738c3fac041b6 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -495,13 +495,10 @@ Status VFileScanner::_fill_columns_from_path(size_t rows) { auto& [value, slot_desc] = kv.second; auto _text_serde = 
slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); - vector slices(rows); - for (int i = 0; i < rows; i++) { - slices[i] = {value.data(), value.size()}; - } int num_deserialized = 0; - if (_text_serde->deserialize_column_from_json_vector(*col_ptr, slices, &num_deserialized, - _text_formatOptions) != Status::OK()) { + if (_text_serde->deserialize_column_from_fixed_json(*col_ptr, slice, rows, + &num_deserialized, + _text_formatOptions) != Status::OK()) { return Status::InternalError("Failed to fill partition column: {}={}", slot_desc->col_name(), value); }