From d1422cd75510d81a52879865f8547add78852f42 Mon Sep 17 00:00:00 2001
From: zhyass
Date: Thu, 15 Dec 2022 22:39:26 +0800
Subject: [PATCH 01/26] create file and dir

---
 .../service/src/interpreters/interpreter_update.rs | 13 +++++++++++++
 src/query/service/src/interpreters/mod.rs          |  1 +
 src/query/storages/fuse/src/operations/mod.rs      |  1 +
 .../storages/fuse/src/operations/mutation/mod.rs   |  1 +
 .../fuse/src/operations/mutation/update/mod.rs     | 13 +++++++++++++
 src/query/storages/fuse/src/operations/update.rs   | 13 +++++++++++++
 6 files changed, 42 insertions(+)
 create mode 100644 src/query/service/src/interpreters/interpreter_update.rs
 create mode 100644 src/query/storages/fuse/src/operations/mutation/update/mod.rs
 create mode 100644 src/query/storages/fuse/src/operations/update.rs

diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs
new file mode 100644
index 000000000000..598edffcf3f6
--- /dev/null
+++ b/src/query/service/src/interpreters/interpreter_update.rs
@@ -0,0 +1,13 @@
+// Copyright 2021 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
diff --git a/src/query/service/src/interpreters/mod.rs b/src/query/service/src/interpreters/mod.rs
index 9174a381e6ca..1f7ecd705f91 100644
--- a/src/query/service/src/interpreters/mod.rs
+++ b/src/query/service/src/interpreters/mod.rs
@@ -70,6 +70,7 @@ mod interpreter_table_show_create;
 mod interpreter_table_truncate;
 mod interpreter_table_undrop;
 mod interpreter_unsetting;
+mod interpreter_update;
 mod interpreter_use_database;
 mod interpreter_user_alter;
 mod interpreter_user_create;
diff --git a/src/query/storages/fuse/src/operations/mod.rs b/src/query/storages/fuse/src/operations/mod.rs
index 4af104fa8c1b..704251b81359 100644
--- a/src/query/storages/fuse/src/operations/mod.rs
+++ b/src/query/storages/fuse/src/operations/mod.rs
@@ -26,6 +26,7 @@ mod read_data;
 mod read_partitions;
 mod recluster;
 mod truncate;
+mod update;
 
 mod fuse_source;
 mod read;
diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs
index d127ac1d6587..d3e82f3f6743 100644
--- a/src/query/storages/fuse/src/operations/mutation/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mod.rs
@@ -19,6 +19,7 @@ mod deletion;
 pub mod mutation_meta;
 pub mod mutation_sink;
 pub mod recluster_mutator;
+mod update;
 
 pub use abort_operation::AbortOperation;
 pub use base_mutator::BaseMutator;
diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
new file mode 100644
index 000000000000..942aecf54b24
--- /dev/null
+++ b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
@@ -0,0 +1,13 @@
+// Copyright 2022 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs
new file mode 100644
index 000000000000..598edffcf3f6
--- /dev/null
+++ b/src/query/storages/fuse/src/operations/update.rs
@@ -0,0 +1,13 @@
+// Copyright 2021 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

From 23ae3e0cb322bd36bd80fb7afc9579337ad01d9f Mon Sep 17 00:00:00 2001
From: zhyass
Date: Wed, 21 Dec 2022 12:01:52 +0800
Subject: [PATCH 02/26] update

---
 .../src/operations/mutation/update/mod.rs        |   2 +
 .../operations/mutation/update/update_source.rs  | 124 ++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 src/query/storages/fuse/src/operations/mutation/update/update_source.rs

diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
index 942aecf54b24..373fe6fbb636 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
@@ -11,3 +11,5 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+mod update_source;
\ No newline at end of file
diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
new file mode 100644
index 000000000000..43ca564b3ba2
--- /dev/null
+++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
@@ -0,0 +1,124 @@
+// Copyright 2022 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+use std::any::Any;
+use std::ops::Not;
+use std::sync::Arc;
+
+use common_catalog::plan::PartInfoPtr;
+use common_catalog::table_context::TableContext;
+use common_datablocks::serialize_data_blocks;
+use common_datablocks::DataBlock;
+use common_datavalues::BooleanColumn;
+use common_datavalues::ColumnRef;
+use common_datavalues::DataSchemaRef;
+use common_datavalues::Series;
+use common_exception::ErrorCode;
+use common_exception::Result;
+use common_sql::evaluator::EvalNode;
+use common_storages_table_meta::meta::BlockMeta;
+use common_storages_table_meta::meta::ClusterStatistics;
+use opendal::Operator;
+
+use crate::io::write_data;
+use crate::io::BlockReader;
+use crate::io::TableMetaLocationGenerator;
+use crate::operations::util;
+use crate::operations::BloomIndexState;
+use crate::pipelines::processors::port::OutputPort;
+use crate::pipelines::processors::processor::Event;
+use crate::pipelines::processors::processor::ProcessorPtr;
+use crate::pipelines::processors::Processor;
+use crate::pruning::BlockIndex;
+use crate::statistics::gen_columns_statistics;
+use crate::statistics::ClusterStatsGenerator;
+use crate::FuseTable;
+use crate::Table;
+
+enum State {
+    ReadData(Option<PartInfoPtr>),
+    Output(Option<PartInfoPtr>, DataBlock),
+    Finish,
+}
+
+pub struct UpdateSource {
+    state: State,
+    ctx: Arc<dyn TableContext>,
+    output: Arc<OutputPort>,
+}
+
+#[async_trait::async_trait]
+impl Processor for UpdateSource {
+    fn name(&self) -> String {
+        "UpdateSource".to_string()
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn event(&mut self) -> Result<Event> {
+        if matches!(self.state, State::ReadData(None)) {
+            self.state = match self.ctx.try_get_part() {
+                None => State::Finish,
+                Some(part) => State::ReadData(Some(part)),
+            }
+        }
+
+        if matches!(self.state, State::Finish) {
+            self.output.finish();
+            return Ok(Event::Finished);
+        }
+
+        if self.output.is_finished() {
+            return Ok(Event::Finished);
+        }
+
+        if !self.output.can_push() {
+            return Ok(Event::NeedConsume);
+        }
+
+        if matches!(self.state, State::Output(_, _)) {
+            if let State::Output(part, data_block) =
+                std::mem::replace(&mut self.state, State::Finish)
+            {
+                self.state = match part {
+                    None => State::Finish,
+                    Some(part) => State::ReadData(Some(part)),
+                };
+
+                self.output.push_data(Ok(data_block));
+                return Ok(Event::NeedConsume);
+            }
+        }
+
+        todo!()
+    }
+
+    fn process(&mut self) -> Result<()> {
+        match std::mem::replace(&mut self.state, State::Finish) {
+            _ => return Err(ErrorCode::Internal("It's a bug.")),
+        }
+        Ok(())
+    }
+
+    async fn async_process(&mut self) -> Result<()> {
+        match std::mem::replace(&mut self.state, State::Finish) {
+            _ => return Err(ErrorCode::Internal("It's a bug.")),
+        }
+        Ok(())
+    }
+}

From 77106092e950d081c33e12296b1b400906b9ff84 Mon Sep 17 00:00:00 2001
From: zhyass
Date: Thu, 22 Dec 2022 23:34:13 +0800
Subject: [PATCH 03/26] rename deletepart to mutationpart

---
 .../storages/fuse/src/operations/delete.rs          |  4 ++--
 .../mutation/deletion/deletion_source.rs            |  4 ++--
 .../src/operations/mutation/deletion/mod.rs         |  2 --
 .../fuse/src/operations/mutation/mod.rs             |  3 ++-
 .../{deletion/deletion_part.rs => mutation_part.rs} | 18 +++++++++---------
 .../fuse/src/operations/mutation/update/mod.rs      |  2 +-
 .../mutation/update/update_source.rs                | 17 ++++++++++++---
 7 files changed, 30 insertions(+), 20 deletions(-)
 rename src/query/storages/fuse/src/operations/mutation/{deletion/deletion_part.rs => mutation_part.rs} (79%)

diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs
index 034fa850580d..e1e2bebb7d5e 100644
--- a/src/query/storages/fuse/src/operations/delete.rs
+++ b/src/query/storages/fuse/src/operations/delete.rs
@@ -32,9 +32,9 @@ use common_sql::evaluator::Evaluator;
 use common_storages_table_meta::meta::Location;
 use common_storages_table_meta::meta::TableSnapshot;
 
-use crate::operations::mutation::DeletionPartInfo;
 use crate::operations::mutation::DeletionSource;
 use crate::operations::mutation::DeletionTransform;
+use crate::operations::mutation::MutationPartInfo;
 use crate::operations::mutation::MutationSink;
 use crate::pipelines::processors::port::InputPort;
 use crate::pipelines::processors::port::OutputPort;
@@ -186,7 +186,7 @@ impl FuseTable {
             index_stats
                 .into_iter()
                 .zip(inner_parts.partitions.into_iter())
-                .map(|((a, b), c)| DeletionPartInfo::create(a, b, c))
+                .map(|((a, b), c)| MutationPartInfo::create(a, b, c))
                 .collect(),
         );
         ctx.try_set_partitions(parts)?;
diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs
index e5fb00ed55a6..a25b01c7f8e4 100644
--- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs
+++ b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs
@@ -34,10 +34,10 @@ use opendal::Operator;
 
 use super::deletion_meta::Deletion;
 use super::deletion_meta::DeletionSourceMeta;
-use super::deletion_part::DeletionPartInfo;
 use crate::io::write_data;
 use crate::io::BlockReader;
 use crate::io::TableMetaLocationGenerator;
+use crate::operations::mutation::MutationPartInfo;
 use crate::operations::util;
 use crate::operations::BloomIndexState;
 use crate::pipelines::processors::port::OutputPort;
@@ -302,7 +302,7 @@ impl Processor for DeletionSource {
     async fn async_process(&mut self) -> Result<()> {
         match std::mem::replace(&mut self.state, State::Finish) {
             State::ReadData(Some(part)) => {
-                let deletion_part = DeletionPartInfo::from_part(&part)?;
+                let deletion_part = MutationPartInfo::from_part(&part)?;
                 self.index = deletion_part.index;
                 self.origin_stats = deletion_part.cluster_stats.clone();
                 let part = deletion_part.inner_part.clone();
diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs b/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs
index c6c06846a3f7..d4efa10b1622 100644
--- a/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs
@@ -13,11 +13,9 @@
 // limitations under the License.
 
 mod deletion_meta;
-mod deletion_part;
 mod deletion_source;
 mod deletion_transform;
 
 pub use deletion_meta::Deletion;
-pub use deletion_part::DeletionPartInfo;
 pub use deletion_source::DeletionSource;
 pub use deletion_transform::DeletionTransform;
diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs
index d3e82f3f6743..8694e0f6e57c 100644
--- a/src/query/storages/fuse/src/operations/mutation/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mod.rs
@@ -17,6 +17,7 @@ pub mod base_mutator;
 mod compact;
 mod deletion;
 pub mod mutation_meta;
+mod mutation_part;
 pub mod mutation_sink;
 pub mod recluster_mutator;
 mod update;
@@ -30,9 +31,9 @@ pub use compact::MergeSegmentsTransform;
 pub use compact::SegmentCompactMutator;
 pub use compact::SegmentCompactionState;
 pub use compact::SegmentCompactor;
-pub use deletion::DeletionPartInfo;
 pub use deletion::DeletionSource;
 pub use deletion::DeletionTransform;
 pub use mutation_meta::MutationMeta;
+pub use mutation_part::MutationPartInfo;
 pub use mutation_sink::MutationSink;
 pub use recluster_mutator::ReclusterMutator;
diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_part.rs b/src/query/storages/fuse/src/operations/mutation/mutation_part.rs
similarity index 79%
rename from src/query/storages/fuse/src/operations/mutation/deletion/deletion_part.rs
rename to src/query/storages/fuse/src/operations/mutation/mutation_part.rs
index bb2fda70d67d..2dad0c5b1221 100644
--- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_part.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mutation_part.rs
@@ -24,20 +24,20 @@ use common_storages_table_meta::meta::ClusterStatistics;
 use crate::pruning::BlockIndex;
 
 #[derive(serde::Serialize, serde::Deserialize, PartialEq)]
-pub struct DeletionPartInfo {
+pub struct MutationPartInfo {
     pub index: BlockIndex,
     pub cluster_stats: Option<ClusterStatistics>,
     pub inner_part: PartInfoPtr,
 }
 
-#[typetag::serde(name = "deletion")]
-impl PartInfo for DeletionPartInfo {
+#[typetag::serde(name = "mutation")]
+impl PartInfo for MutationPartInfo {
     fn as_any(&self) -> &dyn Any {
         self
     }
 
     fn equals(&self, info: &Box<dyn PartInfo>) -> bool {
-        match info.as_any().downcast_ref::<DeletionPartInfo>() {
+        match info.as_any().downcast_ref::<MutationPartInfo>() {
             None => false,
             Some(other) => self == other,
         }
     }
 }
 
-impl DeletionPartInfo {
+impl MutationPartInfo {
     pub fn create(
         index: BlockIndex,
         cluster_stats: Option<ClusterStatistics>,
         inner_part: PartInfoPtr,
     ) -> PartInfoPtr {
-        Arc::new(Box::new(DeletionPartInfo {
+        Arc::new(Box::new(MutationPartInfo {
             index,
             cluster_stats,
             inner_part,
         }))
     }
 
-    pub fn from_part(info: &PartInfoPtr) -> Result<&DeletionPartInfo> {
-        match info.as_any().downcast_ref::<DeletionPartInfo>() {
+    pub fn from_part(info: &PartInfoPtr) -> Result<&MutationPartInfo> {
+        match info.as_any().downcast_ref::<MutationPartInfo>() {
             Some(part_ref) => Ok(part_ref),
             None => Err(ErrorCode::Internal(
-                "Cannot downcast from PartInfo to DeletionPartInfo.",
+                "Cannot downcast from PartInfo to MutationPartInfo.",
             )),
         }
     }
diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
index 373fe6fbb636..25ddbfd26af4 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
@@ -12,4 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-mod update_source;
\ No newline at end of file
+mod update_source;
diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
index 43ca564b3ba2..1255e61ab03e 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
@@ -12,14 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-
 use std::any::Any;
 use std::ops::Not;
 use std::sync::Arc;
 
 use common_catalog::plan::PartInfoPtr;
 use common_catalog::table_context::TableContext;
-use common_datablocks::serialize_data_blocks;
+use common_datablocks::serialize_to_parquet;
 use common_datablocks::DataBlock;
 use common_datavalues::BooleanColumn;
 use common_datavalues::ColumnRef;
 use common_datavalues::DataSchemaRef;
 use common_datavalues::Series;
@@ -35,6 +34,7 @@ use opendal::Operator;
 
 use crate::io::write_data;
 use crate::io::BlockReader;
 use crate::io::TableMetaLocationGenerator;
+use crate::operations::mutation::MutationPartInfo;
 use crate::operations::util;
 use crate::operations::BloomIndexState;
 use crate::pipelines::processors::port::OutputPort;
@@ -57,8 +57,11 @@ pub struct UpdateSource {
     state: State,
     ctx: Arc<dyn TableContext>,
     output: Arc<OutputPort>,
-}
+    index: BlockIndex,
+    cluster_stats_gen: ClusterStatsGenerator,
+    origin_stats: Option<ClusterStatistics>,
+}
 
 #[async_trait::async_trait]
 impl Processor for UpdateSource {
     fn name(&self) -> String {
@@ -117,6 +120,14 @@ impl Processor for UpdateSource {
 
     async fn async_process(&mut self) -> Result<()> {
         match std::mem::replace(&mut self.state, State::Finish) {
+            State::ReadData(Some(part)) => {
+                let part = MutationPartInfo::from_part(&part)?;
+                self.index = part.index;
+                self.origin_stats = part.cluster_stats.clone();
+                let inner_part = part.inner_part.clone();
+                // let chunks = self.block_reader.read_columns_data(inner_part.clone()).await?;
+                // self.state = State::FilterData(inner_part, chunks);
+            }
             _ => return Err(ErrorCode::Internal("It's a bug.")),
         }
         Ok(())

From d5eed3aa2c4511764028fa153c0cff4973f6d642 Mon Sep 17 00:00:00 2001
From: zhyass
Date: Thu, 29 Dec 2022 20:43:02 +0800
Subject: [PATCH 04/26] update source

---
 .../storages/fuse/src/operations/delete.rs       |   4 +-
 .../fuse/src/operations/mutation/mod.rs          |   1 +
 .../src/operations/mutation/update/mod.rs        |   3 +
 .../operations/mutation/update/update_meta.rs    |  59 +++++
 .../mutation/update/update_source.rs             | 215 +++++++++++++++--
 .../storages/fuse/src/operations/update.rs       | 222 ++++++++++++++++++
 6 files changed, 487 insertions(+), 17 deletions(-)
 create mode 100644 src/query/storages/fuse/src/operations/mutation/update/update_meta.rs

diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs
index e1e2bebb7d5e..1d065f0dba4f 100644
--- a/src/query/storages/fuse/src/operations/delete.rs
+++ b/src/query/storages/fuse/src/operations/delete.rs
@@ -124,7 +124,7 @@ impl FuseTable {
         Ok(())
     }
 
-    fn try_eval_const(&self, filter: &Expression) -> Result<bool> {
+    pub fn try_eval_const(&self, filter: &Expression) -> Result<bool> {
         let func_ctx = FunctionContext::default();
 
         let dummy_field = DataField::new("dummy", NullType::new_impl());
@@ -176,7 +176,7 @@ impl FuseTable {
         let (_, inner_parts) = self.read_partitions_with_metas(
             ctx.clone(),
             self.table_info.schema(),
-            None,
+            push_down,
             metas,
             base_snapshot.summary.block_count as usize,
         )?;
diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs
index 8694e0f6e57c..b51360c19a64 100644
--- a/src/query/storages/fuse/src/operations/mutation/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mod.rs
@@ -37,3 +37,4 @@ pub use mutation_meta::MutationMeta;
 pub use mutation_part::MutationPartInfo;
 pub use mutation_sink::MutationSink;
 pub use recluster_mutator::ReclusterMutator;
+pub use update::UpdateSource;
diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
index 25ddbfd26af4..37ca090672ba 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
@@ -12,4 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+mod update_meta;
 mod update_source;
+
+pub use update_source::UpdateSource;
\ No newline at end of file
diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs b/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs
new file mode 100644
index 000000000000..692986b25e42
--- /dev/null
+++ b/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs
@@ -0,0 +1,59 @@
+// Copyright 2022 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use common_datablocks::BlockMetaInfo;
+use common_datablocks::BlockMetaInfoPtr;
+use common_exception::ErrorCode;
+use common_exception::Result;
+use common_storages_table_meta::meta::BlockMeta;
+
+use crate::pruning::BlockIndex;
+
+#[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq)]
+pub struct UpdateSourceMeta {
+    pub index: BlockIndex,
+    pub replace: Arc<BlockMeta>,
+}
+
+#[typetag::serde(name = "update_source_meta")]
+impl BlockMetaInfo for UpdateSourceMeta {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool {
+        match info.as_any().downcast_ref::<UpdateSourceMeta>() {
+            None => false,
+            Some(other) => self == other,
+        }
+    }
+}
+
+impl UpdateSourceMeta {
+    pub fn create(index: BlockIndex, replace: Arc<BlockMeta>) -> BlockMetaInfoPtr {
+        Arc::new(Box::new(UpdateSourceMeta { index, replace }))
+    }
+
+    pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&UpdateSourceMeta> {
+        match info.as_any().downcast_ref::<UpdateSourceMeta>() {
+            Some(part_ref) => Ok(part_ref),
+            None => Err(ErrorCode::Internal(
+                "Cannot downcast from BlockMetaInfo to UpdateSourceMeta.",
+            )),
+        }
+    }
+}
diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
index 1255e61ab03e..faece7d7351d 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
@@ -13,24 +13,22 @@
 // limitations under the License.
 
 use std::any::Any;
-use std::ops::Not;
 use std::sync::Arc;
 
 use common_catalog::plan::PartInfoPtr;
 use common_catalog::table_context::TableContext;
 use common_datablocks::serialize_to_parquet;
 use common_datablocks::DataBlock;
-use common_datavalues::BooleanColumn;
-use common_datavalues::ColumnRef;
-use common_datavalues::DataSchemaRef;
-use common_datavalues::Series;
+use common_datavalues::DataField;
+use common_datavalues::ToDataType;
 use common_exception::ErrorCode;
 use common_exception::Result;
+use common_sql::evaluator::ChunkOperator;
 use common_sql::evaluator::EvalNode;
 use common_storages_table_meta::meta::BlockMeta;
-use common_storages_table_meta::meta::ClusterStatistics;
 use opendal::Operator;
 
+use super::update_meta::UpdateSourceMeta;
 use crate::io::write_data;
 use crate::io::BlockReader;
 use crate::io::TableMetaLocationGenerator;
@@ -43,12 +41,28 @@ use crate::pipelines::processors::processor::ProcessorPtr;
 use crate::pipelines::processors::Processor;
 use crate::pruning::BlockIndex;
 use crate::statistics::gen_columns_statistics;
-use crate::statistics::ClusterStatsGenerator;
-use crate::FuseTable;
-use crate::Table;
+
+type DataChunks = Vec<(usize, Vec<u8>)>;
+
+struct SerializeState {
+    block_data: Vec<u8>,
+    block_location: String,
+    index_data: Vec<u8>,
+    index_location: String,
+}
 
 enum State {
     ReadData(Option<PartInfoPtr>),
+    FilterData(PartInfoPtr, DataChunks),
+    ReadRemain(PartInfoPtr, DataBlock),
+    MergeRemain {
+        part: PartInfoPtr,
+        chunks: DataChunks,
+        data_block: DataBlock,
+    },
+    UpdateData(DataBlock),
+    NeedSerialize(DataBlock),
+    Serialized(SerializeState, Arc<BlockMeta>),
+    Generated(Arc<BlockMeta>),
     Output(Option<PartInfoPtr>, DataBlock),
     Finish,
 }
@@ -57,10 +71,42 @@ pub struct UpdateSource {
     state: State,
     ctx: Arc<dyn TableContext>,
     output: Arc<OutputPort>,
+    location_gen: TableMetaLocationGenerator,
+    dal: Operator,
+
+    block_reader: Arc<BlockReader>,
+    filter: Arc<Option<EvalNode>>,
+    remain_reader: Arc<Option<BlockReader>>,
+    operators: Vec<ChunkOperator>,
 
     index: BlockIndex,
-    cluster_stats_gen: ClusterStatsGenerator,
-    origin_stats: Option<ClusterStatistics>,
 }
+
+impl UpdateSource {
+    #[allow(clippy::too_many_arguments)]
+    pub fn try_create(
+        ctx: Arc<dyn TableContext>,
+        output: Arc<OutputPort>,
+        location_gen: TableMetaLocationGenerator,
+        dal: Operator,
+        block_reader: Arc<BlockReader>,
+        filter: Arc<Option<EvalNode>>,
+        remain_reader: Arc<Option<BlockReader>>,
+        operators: Vec<ChunkOperator>,
+    ) -> Result<ProcessorPtr> {
+        Ok(ProcessorPtr::create(Box::new(UpdateSource {
+            state: State::ReadData(None),
+            ctx: ctx.clone(),
+            output,
+            location_gen,
+            dal,
+            block_reader,
+            filter,
+            remain_reader,
+            operators,
+            index: (0, 0),
+        })))
+    }
+}
 
 #[async_trait::async_trait]
 impl Processor for UpdateSource {
@@ -108,11 +154,117 @@ impl Processor for UpdateSource {
 
                 self.output.push_data(Ok(data_block));
                 return Ok(Event::NeedConsume);
             }
         }
 
-        todo!()
+        if matches!(
+            self.state,
+            State::ReadData(_) | State::ReadRemain { .. } | State::Serialized(_, _)
+        ) {
+            Ok(Event::Async)
+        } else {
+            Ok(Event::Sync)
+        }
     }
 
     fn process(&mut self) -> Result<()> {
         match std::mem::replace(&mut self.state, State::Finish) {
+            State::FilterData(part, chunks) => {
+                let mut data_block = self.block_reader.deserialize(part.clone(), chunks)?;
+                if let Some(filter) = self.filter.as_ref() {
+                    let filter_result = filter
+                        .eval(&self.ctx.try_get_function_context()?, &data_block)?
+                        .vector;
+                    let predicates = DataBlock::cast_to_nonull_boolean(&filter_result)?;
+                    if DataBlock::filter_exists(&predicates)? {
+                        let field = DataField::new("_predicate", bool::to_data_type());
+                        data_block = data_block.add_column(predicates, field)?;
+                        if self.remain_reader.is_none() {
+                            self.state = State::UpdateData(data_block);
+                        } else {
+                            self.state = State::ReadRemain(part, data_block);
+                        }
+                    } else {
+                        let new_part = self.ctx.try_get_part();
+                        self.state = State::Output(new_part, DataBlock::empty());
+                    }
+                } else {
+                    self.state = State::UpdateData(data_block);
+                }
+            }
+            State::MergeRemain {
+                part,
+                chunks,
+                mut data_block,
+            } => {
+                let merged = if chunks.is_empty() {
+                    data_block
+                } else if let Some(remain_reader) = self.remain_reader.as_ref() {
+                    let remain_block = remain_reader.deserialize(part, chunks)?;
+                    for (col, field) in remain_block
+                        .columns()
+                        .iter()
+                        .zip(remain_block.schema().fields())
+                    {
+                        data_block = data_block.add_column(col.clone(), field.clone())?;
+                    }
+                    data_block
+                } else {
+                    return Err(ErrorCode::Internal("It's a bug. Need remain reader"));
+                };
+                self.state = State::UpdateData(merged);
+            }
+            State::UpdateData(data_block) => {
+                let func_ctx = self.ctx.try_get_function_context()?;
+                let block = self
+                    .operators
+                    .iter()
+                    .try_fold(data_block, |input, op| op.execute(&func_ctx, input))?;
+                self.state = State::NeedSerialize(block);
+            }
+            State::NeedSerialize(block) => {
+                let row_count = block.num_rows() as u64;
+                let block_size = block.memory_size() as u64;
+                let (block_location, block_id) = self.location_gen.gen_block_location();
+
+                // build block index.
+                let location = self.location_gen.block_bloom_index_location(&block_id);
+                let (bloom_index_state, column_distinct_count) =
+                    BloomIndexState::try_create(&block, location)?;
+                let col_stats = gen_columns_statistics(&block, Some(column_distinct_count))?;
+
+                // serialize data block.
+                let mut block_data = Vec::with_capacity(100 * 1024 * 1024);
+                let schema = block.schema().clone();
+                let (file_size, meta_data) =
+                    serialize_to_parquet(vec![block], &schema, &mut block_data)?;
+                let col_metas = util::column_metas(&meta_data)?;
+
+                // new block meta.
+ let new_meta = Arc::new(BlockMeta::new( + row_count, + block_size, + file_size, + col_stats, + col_metas, + None, + block_location.clone(), + Some(bloom_index_state.location.clone()), + bloom_index_state.size, + )); + + self.state = State::Serialized( + SerializeState { + block_data, + block_location: block_location.0, + index_data: bloom_index_state.data, + index_location: bloom_index_state.location.0, + }, + new_meta, + ); + } + State::Generated(replace) => { + let meta = UpdateSourceMeta::create(self.index, replace); + let new_part = self.ctx.try_get_part(); + self.state = State::Output(new_part, DataBlock::empty_with_meta(meta)); + } _ => return Err(ErrorCode::Internal("It's a bug.")), } Ok(()) @@ -123,10 +275,43 @@ impl Processor for UpdateSource { State::ReadData(Some(part)) => { let part = MutationPartInfo::from_part(&part)?; self.index = part.index; - self.origin_stats = part.cluster_stats.clone(); let inner_part = part.inner_part.clone(); - // let chunks = self.block_reader.read_columns_data(inner_part.clone()).await?; - // self.state = State::FilterData(inner_part, chunks); + let chunks = self + .block_reader + .read_columns_data(self.ctx.clone(), inner_part.clone()) + .await?; + self.state = State::FilterData(inner_part, chunks); + } + State::ReadRemain(part, data_block) => { + if let Some(remain_reader) = self.remain_reader.as_ref() { + let chunks = remain_reader + .read_columns_data(self.ctx.clone(), part.clone()) + .await?; + self.state = State::MergeRemain { + part, + chunks, + data_block, + }; + } else { + return Err(ErrorCode::Internal("It's a bug. No remain reader")); + } + } + State::Serialized(serialize_state, block_meta) => { + // write block data. + write_data( + &serialize_state.block_data, + &self.dal, + &serialize_state.block_location, + ) + .await?; + // write index data. + write_data( + &serialize_state.index_data, + &self.dal, + &serialize_state.index_location, + ) + .await?; + self.state = State::Generated(block_meta); } _ => return Err(ErrorCode::Internal("It's a bug.")), } diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index 598edffcf3f6..04c222bcf05c 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -11,3 +11,225 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+
+use std::collections::BTreeMap;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use common_catalog::plan::Expression;
+use common_catalog::plan::Partitions;
+use common_catalog::plan::PartitionsShuffleKind;
+use common_catalog::plan::Projection;
+use common_catalog::plan::PushDownInfo;
+use common_catalog::table::Table;
+use common_catalog::table_context::TableContext;
+use common_datavalues::prelude::*;
+use common_exception::Result;
+use common_sql::evaluator::ChunkOperator;
+use common_sql::evaluator::Evaluator;
+
+use crate::operations::mutation::MutationPartInfo;
+use crate::pipelines::Pipeline;
+use crate::pruning::BlockPruner;
+use crate::FuseTable;
+
+use crate::operations::mutation::UpdateSource;
+
+impl FuseTable {
+    pub async fn do_update(
+        &self,
+        ctx: Arc<dyn TableContext>,
+        filter: Option<Expression>,
+        col_indices: Vec<usize>,
+        update_list: HashMap<usize, Expression>,
+        pipeline: &mut Pipeline,
+    ) -> Result<()> {
+        let snapshot_opt = self.read_table_snapshot().await?;
+
+        // check if table is empty
+        let snapshot = if let Some(val) = snapshot_opt {
+            val
+        } else {
+            // no snapshot, no update
+            return Ok(());
+        };
+
+        if snapshot.summary.row_count == 0 {
+            // empty snapshot, no update
+            return Ok(());
+        }
+
+        let all_col_ids = self.all_the_columns_ids();
+        let schema = self.schema();
+        let mut operators = Vec::with_capacity(update_list.len() + 2);
+        let mut offset_map = BTreeMap::new();
+        let mut remain_reader = None;
+        let (projection, filters) = if col_indices.is_empty() {
+            if filter.is_some() && !self.try_eval_const(&filter.unwrap())? {
+                // do nothing.
+                return Ok(());
+            }
+
+            let mut pos = 0;
+            offset_map = all_col_ids.iter().fold(offset_map, |mut acc, id| {
+                acc.insert(*id, pos);
+                pos += 1;
+                acc
+            });
+
+            for (id, expr) in update_list.into_iter() {
+                let field = schema.field(id);
+                let target = field.data_type();
+                let new_expr = Expression::Cast {
+                    input: Box::new(expr),
+                    target: target.clone(),
+                };
+                operators.push(ChunkOperator::Map {
+                    eval: Evaluator::eval_expression(&new_expr, &schema)?,
+                    name: format!("new_{}", field.name()),
+                });
+                offset_map.insert(id, pos);
+                pos += 1;
+            }
+            (Projection::Columns(all_col_ids), vec![])
+        } else {
+            let mut pos = 0;
+            offset_map = col_indices.iter().fold(offset_map, |mut acc, id| {
+                acc.insert(*id, pos);
+                pos += 1;
+                acc
+            });
+
+            let mut fields = schema.fields().clone();
+            fields.push(DataField::new("_predicate", bool::to_data_type()));
+            let input_schema = Arc::new(DataSchema::new(fields));
+            pos += 1;
+
+            let remain_col_ids: Vec<usize> = all_col_ids
+                .into_iter()
+                .filter(|id| !col_indices.contains(id))
+                .collect();
+            if !remain_col_ids.is_empty() {
+                offset_map = remain_col_ids.iter().fold(offset_map, |mut acc, id| {
+                    acc.insert(*id, pos);
+                    pos += 1;
+                    acc
+                });
+
+                remain_reader =
+                    Some((*self.create_block_reader(Projection::Columns(remain_col_ids))?).clone());
+            }
+
+            for (id, expr) in update_list.into_iter() {
+                let field = schema.field(id);
+                let target = field.data_type();
+                let new_expr = Expression::Function {
+                    name: "if".to_string(),
+                    args: vec![
+                        Expression::IndexedVariable {
+                            name: "_predicate".to_string(),
+                            data_type: bool::to_data_type(),
+                        },
+                        Expression::Cast {
+                            input: Box::new(expr),
+                            target: target.clone(),
+                        },
+                        Expression::IndexedVariable {
+                            name: field.name().clone(),
+                            data_type: target.clone(),
+                        },
+                    ],
+                    return_type: target.clone(),
+                };
+                operators.push(ChunkOperator::Map {
+                    eval: Evaluator::eval_expression(&new_expr, &input_schema)?,
+                    name: format!("new_{}", field.name()),
+                });
+                offset_map.insert(id, pos);
+                pos += 1;
+            }
+
+            (Projection::Columns(col_indices.clone()), vec![
+                filter.unwrap().clone(),
+            ])
+        };
+
+        let offsets = offset_map.values().cloned().collect::<Vec<_>>();
+        operators.push(ChunkOperator::Project { offsets });
+        operators.push(ChunkOperator::Rename {
+            output_schema: schema,
+        });
+
+        let block_reader = self.create_block_reader(projection.clone())?;
+        let eval_node = if filters.is_empty() {
+            Arc::new(None)
+        } else {
+            Arc::new(Some(Evaluator::eval_expression(
+                &filters[0],
+                block_reader.schema().as_ref(),
+            )?))
+        };
+
+        let remain_reader = Arc::new(remain_reader);
+
+        let push_down = Some(PushDownInfo {
+            projection: Some(projection),
+            filters,
+            ..PushDownInfo::default()
+        });
+
+        let segments_location = snapshot.segments.clone();
+        let block_metas = BlockPruner::prune(
+            &ctx,
+            self.operator.clone(),
+            self.table_info.schema(),
+            &push_down,
+            segments_location,
+        )
+        .await?;
+
+        let mut indices = Vec::with_capacity(block_metas.len());
+        let mut metas = Vec::with_capacity(block_metas.len());
+        block_metas.into_iter().for_each(|(index, block_meta)| {
+            indices.push(index);
+            metas.push(block_meta);
+        });
+
+        let (_, inner_parts) = self.read_partitions_with_metas(
+            ctx.clone(),
+            self.table_info.schema(),
+            push_down,
+            metas,
+            snapshot.summary.block_count as usize,
+        )?;
+
+        let parts = Partitions::create(
+            PartitionsShuffleKind::Mod,
+            indices
+                .into_iter()
+                .zip(inner_parts.partitions.into_iter())
+                .map(|(a, b)| MutationPartInfo::create(a, None, b))
+                .collect(),
+        );
+        ctx.try_set_partitions(parts)?;
+
+        let max_threads = ctx.get_settings().get_max_threads()? as usize;
+        // Add source pipe.
+        pipeline.add_source(
+            |output| {
+                UpdateSource::try_create(
+                    ctx.clone(),
+                    output,
+                    self.meta_location_generator().clone(),
+                    self.get_operator(),
+                    block_reader.clone(),
+                    eval_node.clone(),
+                    remain_reader.clone(),
+                    operators.clone(),
+                )
+            },
+            max_threads,
+        )?;
+        Ok(())
+    }
+}

From d828040656b5b0cd91341dcf69c4ae0a16d12639 Mon Sep 17 00:00:00 2001
From: zhyass
Date: Thu, 29 Dec 2022 21:57:08 +0800
Subject: [PATCH 05/26] make lint

---
 .../src/operations/mutation/update/mod.rs       |  2 +-
 .../operations/mutation/update/update_meta.rs   | 12 ++++++++--
 .../mutation/update/update_source.rs            | 23 ++++++++++++-------
 .../storages/fuse/src/operations/update.rs      |  6 ++---
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
index 37ca090672ba..d79f2cb34494 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/mod.rs
@@ -15,4 +15,4 @@
 mod update_meta;
 mod update_source;
 
-pub use update_source::UpdateSource;
\ No newline at end of file
+pub use update_source::UpdateSource;
diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs b/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs
index 692986b25e42..a40db4c80703 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs
@@ -23,7 +23,7 @@ use common_storages_table_meta::meta::BlockMeta;
 
 use crate::pruning::BlockIndex;
 
-#[derive(serde::Serialize, serde::Deserialize, Debug, PartialEq)]
+#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)]
 pub struct UpdateSourceMeta {
     pub index: BlockIndex,
     pub replace: Arc<BlockMeta>,
@@ -35,6 +35,14 @@ impl BlockMetaInfo for UpdateSourceMeta {
         self
     }
 
+    fn as_mut_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn clone_self(&self) -> Box<dyn BlockMetaInfo> {
+        Box::new(self.clone())
+    }
+
     fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool {
         match info.as_any().downcast_ref::<UpdateSourceMeta>() {
             None => false,
@@ -45,7 +53,7 @@ impl BlockMetaInfo for UpdateSourceMeta {
 
 impl UpdateSourceMeta {
     pub fn create(index: BlockIndex, replace: Arc<BlockMeta>) -> BlockMetaInfoPtr {
-        Arc::new(Box::new(UpdateSourceMeta { index, replace }))
+        Box::new(UpdateSourceMeta { index, replace })
     }
 
     pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&UpdateSourceMeta> {
diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
index faece7d7351d..e0e696bc87bb 100644
--- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
+++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs
@@ -17,7 +17,6 @@ use std::sync::Arc;
 
 use common_catalog::plan::PartInfoPtr;
 use common_catalog::table_context::TableContext;
-use common_datablocks::serialize_to_parquet;
 use common_datablocks::DataBlock;
 use common_datavalues::DataField;
 use common_datavalues::ToDataType;
@@ -25,7 +24,9 @@ use common_exception::ErrorCode;
 use common_exception::Result;
 use common_sql::evaluator::ChunkOperator;
 use common_sql::evaluator::EvalNode;
+use common_storages_common::blocks_to_parquet;
 use common_storages_table_meta::meta::BlockMeta;
+use common_storages_table_meta::table::TableCompression;
 use opendal::Operator;
 
 use super::update_meta::UpdateSourceMeta;
@@ -43,12 +44,13 @@ use crate::pipelines::processors::processor::ProcessorPtr;
 use crate::pipelines::processors::Processor;
 use crate::pruning::BlockIndex;
 use crate::statistics::gen_columns_statistics;
+use crate::FuseTable;
 
 type DataChunks = Vec<(usize, Vec<u8>)>;
 
 struct SerializeState {
@@ -73,6 +75,7 @@ pub struct UpdateSource {
     output: Arc<OutputPort>,
     location_gen: TableMetaLocationGenerator,
     dal: Operator,
+    table_compression: TableCompression,
 
     block_reader: Arc<BlockReader>,
     filter: Arc<Option<EvalNode>>,
@@ -83,12 +86,10 @@ pub struct UpdateSource {
 }
 
 impl UpdateSource {
-    #[allow(clippy::too_many_arguments)]
     pub fn try_create(
         ctx: Arc<dyn TableContext>,
         output: Arc<OutputPort>,
-        location_gen: TableMetaLocationGenerator,
-        dal: Operator,
+        table: &FuseTable,
         block_reader: Arc<BlockReader>,
         filter: Arc<Option<EvalNode>>,
         remain_reader: Arc<Option<BlockReader>>,
@@ -98,8 +99,9 @@ impl UpdateSource {
         Ok(ProcessorPtr::create(Box::new(UpdateSource {
             state: State::ReadData(None),
             ctx: ctx.clone(),
             output,
-            location_gen,
-            dal,
+            location_gen: table.meta_location_generator().clone(),
+            dal: table.get_operator(),
+            table_compression: table.table_compression,
             block_reader,
             filter,
             remain_reader,
@@ -233,8 +235,12 @@ impl Processor for UpdateSource {
                 // serialize data block.
                 let mut block_data = Vec::with_capacity(100 * 1024 * 1024);
                 let schema = block.schema().clone();
-                let (file_size, meta_data) =
-                    serialize_to_parquet(vec![block], &schema, &mut block_data)?;
+                let (file_size, meta_data) = blocks_to_parquet(
+                    &schema,
+                    vec![block],
+                    &mut block_data,
+                    self.table_compression,
+                )?;
                 let col_metas = util::column_metas(&meta_data)?;
 
                 // new block meta.
@@ -248,6 +254,7 @@ impl Processor for UpdateSource {
                     block_location.clone(),
                     Some(bloom_index_state.location.clone()),
                     bloom_index_state.size,
+                    self.table_compression.into(),
                 ));
 
                 self.state = State::Serialized(
diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs
index 04c222bcf05c..bffd59e0cee7 100644
--- a/src/query/storages/fuse/src/operations/update.rs
+++ b/src/query/storages/fuse/src/operations/update.rs
@@ -29,12 +29,11 @@ use common_sql::evaluator::ChunkOperator;
 use common_sql::evaluator::Evaluator;
 
 use crate::operations::mutation::MutationPartInfo;
+use crate::operations::mutation::UpdateSource;
 use crate::pipelines::Pipeline;
 use crate::pruning::BlockPruner;
 use crate::FuseTable;
-
-use crate::operations::mutation::UpdateSource;
 
 impl FuseTable {
     pub async fn do_update(
@@ -220,8 +219,7 @@ impl FuseTable {
                 UpdateSource::try_create(
                     ctx.clone(),
                     output,
-                    self.meta_location_generator().clone(),
-                    self.get_operator(),
+                    self,
                     block_reader.clone(),
                     eval_node.clone(),
                     remain_reader.clone(),

From 5de1ef58390dfe7ed197e61195c0fee6a8aa47dd Mon Sep 17 00:00:00 2001
From: zhyass
Date: Thu, 29 Dec 2022 22:47:50 +0800
Subject: [PATCH 06/26] rename deletion_transform to mutation_transform

---
 .../storages/fuse/src/operations/delete.rs      | 10 +--
 .../compact/merge_segments_transform.rs         |  4 +-
 .../mutation/deletion/deletion_meta.rs          | 74 -------------
 .../mutation/deletion/deletion_source.rs        | 14 ++--
 .../src/operations/mutation/deletion/mod.rs     |  4 -
 .../fuse/src/operations/mutation/mod.rs         |  7 +-
 .../src/operations/mutation/mutation_meta.rs    | 71 +++++++++++---
 .../src/operations/mutation/mutation_sink.rs    |  4 +-
 ...ion_transform.rs => mutation_transform.rs}   | 32 ++++----
 .../src/operations/mutation/update/mod.rs       |  1 -
 .../operations/mutation/update/update_meta.rs   | 67 -----------
 .../mutation/update/update_source.rs            | 14 ++--
 .../storages/fuse/src/operations/update.rs      |  8 +-
 13 files changed, 113 insertions(+), 197 deletions(-)
 delete mode 100644 src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs
 rename src/query/storages/fuse/src/operations/mutation/{deletion/deletion_transform.rs => mutation_transform.rs} (93%)
 delete mode 100644 src/query/storages/fuse/src/operations/mutation/update/update_meta.rs

diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs
index 1d065f0dba4f..16c18a76d03a 100644
--- a/src/query/storages/fuse/src/operations/delete.rs
+++ b/src/query/storages/fuse/src/operations/delete.rs
@@ -33,9 +33,9 @@ use common_storages_table_meta::meta::Location;
 use common_storages_table_meta::meta::TableSnapshot;
 
 use crate::operations::mutation::DeletionSource;
-use crate::operations::mutation::DeletionTransform;
 use crate::operations::mutation::MutationPartInfo;
 use crate::operations::mutation::MutationSink;
+use crate::operations::mutation::MutationTransform;
 use crate::pipelines::processors::port::InputPort;
 use crate::pipelines::processors::port::OutputPort;
 use crate::pipelines::Pipe;
@@ -49,7 +49,7 @@ impl FuseTable {
     /// +---------------+
     /// |DeletionSource1| ------
     /// +---------------+      |      +-----------------+      +------------+
-    /// |      ...      | ...  | ---> |DeletionTransform| ---> |MutationSink|
+    /// |      ...      | ...  | ---> |MutationTransform| ---> |MutationSink|
     /// +---------------+      |      +-----------------+      +------------+
     /// |DeletionSourceN| ------
     /// +---------------+
@@ -116,7 +116,7 @@ impl FuseTable {
         self.try_add_deletion_source(ctx.clone(), &filter_expr, col_indices, &snapshot, pipeline)
             .await?;
 
-        self.try_add_deletion_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?;
+        self.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?;
 
         pipeline.add_sink(|input| {
             MutationSink::try_create(self, ctx.clone(), snapshot.clone(), input)
@@ -227,7 +227,7 @@ impl FuseTable {
         )
     }
 
-    fn try_add_deletion_transform(
+    pub fn try_add_mutation_transform(
         &self,
         ctx: Arc<dyn TableContext>,
         base_segments: Vec<Location>,
@@ -245,7 +245,7 @@ impl FuseTable {
             inputs_port.push(InputPort::create());
         }
         let output_port = OutputPort::create();
-        let processor = DeletionTransform::try_create(
+        let processor = MutationTransform::try_create(
             ctx,
             inputs_port.clone(),
             output_port.clone(),
diff --git a/src/query/storages/fuse/src/operations/mutation/compact/merge_segments_transform.rs b/src/query/storages/fuse/src/operations/mutation/compact/merge_segments_transform.rs
index 5a1f270e99eb..bbefa5377d8c 100644
--- a/src/query/storages/fuse/src/operations/mutation/compact/merge_segments_transform.rs
+++ b/src/query/storages/fuse/src/operations/mutation/compact/merge_segments_transform.rs
@@ -27,7 +27,7 @@ use itertools::Itertools;
 use crate::operations::mutation::compact::CompactSinkMeta;
 use crate::operations::mutation::AbortOperation;
 use crate::operations::mutation::BlockCompactMutator;
-use crate::operations::mutation::MutationMeta;
+use crate::operations::mutation::MutationSinkMeta;
 use crate::pipelines::processors::port::InputPort;
 use crate::pipelines::processors::port::OutputPort;
 use crate::pipelines::processors::processor::Event;
@@ -172,7 +172,7 @@ impl Processor for MergeSegmentsTransform {
                     .sorted_by_key(|&(_, r)| *r)
                     .map(|(l, _)| l)
                     .collect();
-                let meta = MutationMeta::create(
+                let meta = MutationSinkMeta::create(
                     merged_segments,
                     std::mem::take(&mut self.merged_statistics),
                     std::mem::take(&mut self.abort_operation),
diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs
deleted file mode 100644
index a8a27da79a40..000000000000
--- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2022 Datafuse Labs.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::any::Any;
-use std::sync::Arc;
-
-use common_datablocks::BlockMetaInfo;
-use common_datablocks::BlockMetaInfoPtr;
-use common_exception::ErrorCode;
-use common_exception::Result;
-use common_storages_table_meta::meta::BlockMeta;
-
-use crate::pruning::BlockIndex;
-
-#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
-pub enum Deletion {
-    DoNothing,
-    Replaced(Arc<BlockMeta>),
-    Deleted,
-}
-
-#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)]
-pub struct DeletionSourceMeta {
-    pub index: BlockIndex,
-    pub op: Deletion,
-}
-
-#[typetag::serde(name = "deletion_source_meta")]
-impl BlockMetaInfo for DeletionSourceMeta {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_mut_any(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn clone_self(&self) -> Box<dyn BlockMetaInfo> {
-        Box::new(self.clone())
-    }
-
-    fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool {
-        match info.as_any().downcast_ref::<DeletionSourceMeta>() {
-            None => false,
-            Some(other) => self == other,
-        }
-    }
-}
-
-impl DeletionSourceMeta {
-    pub fn create(index: BlockIndex, op: Deletion) -> BlockMetaInfoPtr {
-        Box::new(DeletionSourceMeta { index, op })
-    }
-
-    pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&DeletionSourceMeta> {
-        match info.as_any().downcast_ref::<DeletionSourceMeta>() {
-            Some(part_ref) => Ok(part_ref),
-            None => Err(ErrorCode::Internal(
-                "Cannot downcast from BlockMetaInfo to DeletionSourceMeta.",
-            )),
-        }
-    }
-}
diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs
index a25b01c7f8e4..18807c5cf09e 100644
--- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs
+++ b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs
@@ -32,12 +32,12 @@ use common_storages_table_meta::meta::ClusterStatistics;
 use common_storages_table_meta::table::TableCompression;
 use opendal::Operator;
 
-use super::deletion_meta::Deletion;
-use super::deletion_meta::DeletionSourceMeta;
 use crate::io::write_data;
 use crate::io::BlockReader;
 use crate::io::TableMetaLocationGenerator;
+use crate::operations::mutation::Mutation;
 use crate::operations::mutation::MutationPartInfo;
+use crate::operations::mutation::MutationSourceMeta;
 use crate::operations::util;
 use crate::operations::BloomIndexState;
 use crate::pipelines::processors::port::OutputPort;
@@ -75,7 +75,7 @@ enum State {
     },
     NeedSerialize(DataBlock),
     Serialized(SerializeState, Arc<BlockMeta>),
-    Generated(Deletion),
+    Generated(Mutation),
     Output(Option<PartInfoPtr>, DataBlock),
     Finish,
 }
@@ -194,13 +194,13 @@ impl Processor for DeletionSource {
                 let filter: ColumnRef = Arc::new(BooleanColumn::from_arrow_data(values.not()));
                 if !DataBlock::filter_exists(&filter)? {
                     // all the rows should be removed.
-                    self.state = State::Generated(Deletion::Deleted);
+                    self.state = State::Generated(Mutation::Deleted);
                 } else {
                     let num_rows = data_block.num_rows();
                     let data_block = DataBlock::filter_block(data_block, &filter)?;
                     if data_block.num_rows() == num_rows {
                         // none of the rows should be removed.
-                        self.state = State::Generated(Deletion::DoNothing);
+                        self.state = State::Generated(Mutation::DoNothing);
                     } else if self.remain_reader.is_none() {
                         let block = data_block.resort(self.output_schema.clone())?;
                         self.state = State::NeedSerialize(block);
@@ -290,7 +290,7 @@ impl Processor for DeletionSource {
                 );
             }
             State::Generated(op) => {
-                let meta = DeletionSourceMeta::create(self.index, op);
+                let meta = MutationSourceMeta::create(self.index, op);
                 let new_part = self.ctx.try_get_part();
                 self.state = State::Output(new_part, DataBlock::empty_with_meta(meta));
             }
@@ -346,7 +346,7 @@ impl Processor for DeletionSource {
                     &serialize_state.index_location,
                 )
                 .await?;
-                self.state = State::Generated(Deletion::Replaced(block_meta));
+                self.state = State::Generated(Mutation::Replaced(block_meta));
             }
             _ => return Err(ErrorCode::Internal("It's a bug.")),
         }
diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs b/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs
index d4efa10b1622..075f941b5fb4 100644
--- a/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs
@@ -12,10 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-mod deletion_meta;
 mod deletion_source;
-mod deletion_transform;
 
-pub use deletion_meta::Deletion;
 pub use deletion_source::DeletionSource;
-pub use deletion_transform::DeletionTransform;
diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs
index b51360c19a64..a5025cd5116d 100644
--- a/src/query/storages/fuse/src/operations/mutation/mod.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mod.rs
@@ -19,6 +19,7 @@ mod deletion;
 pub mod mutation_meta;
 mod mutation_part;
 pub mod mutation_sink;
+mod mutation_transform;
 pub mod recluster_mutator;
 mod update;
 
@@ -32,9 +33,11 @@ pub use compact::MergeSegmentsTransform;
 pub use compact::SegmentCompactMutator;
 pub use compact::SegmentCompactionState;
 pub use compact::SegmentCompactor;
 pub use deletion::DeletionSource;
-pub use deletion::DeletionTransform;
-pub use mutation_meta::MutationMeta;
+pub use mutation_meta::Mutation;
+pub use mutation_meta::MutationSinkMeta;
+pub use mutation_meta::MutationSourceMeta;
 pub use mutation_part::MutationPartInfo;
 pub use mutation_sink::MutationSink;
+pub use mutation_transform::MutationTransform;
 pub use recluster_mutator::ReclusterMutator;
 pub use update::UpdateSource;
diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs
index ccce1acfd6b5..e64444330e46 100644
--- a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs
@@ -13,25 +13,78 @@
 // limitations under the License.
 
 use std::any::Any;
+use std::sync::Arc;
 
 use common_datablocks::BlockMetaInfo;
 use common_datablocks::BlockMetaInfoPtr;
 use common_exception::ErrorCode;
 use common_exception::Result;
+use common_storages_table_meta::meta::BlockMeta;
 use common_storages_table_meta::meta::Location;
 use common_storages_table_meta::meta::Statistics;
 
 use crate::operations::mutation::AbortOperation;
+use crate::pruning::BlockIndex;
+
+#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)]
+pub enum Mutation {
+    DoNothing,
+    Replaced(Arc<BlockMeta>),
+    Deleted,
+}
+
+#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)]
+pub struct MutationSourceMeta {
+    pub index: BlockIndex,
+    pub op: Mutation,
+}
+
+#[typetag::serde(name = "mutation_source_meta")]
+impl BlockMetaInfo for MutationSourceMeta {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn as_mut_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn clone_self(&self) -> Box<dyn BlockMetaInfo> {
+        Box::new(self.clone())
+    }
+
+    fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool {
+        match info.as_any().downcast_ref::<MutationSourceMeta>() {
+            None => false,
+            Some(other) => self == other,
+        }
+    }
+}
+
+impl MutationSourceMeta {
+    pub fn create(index: BlockIndex, op: Mutation) -> BlockMetaInfoPtr {
+        Box::new(MutationSourceMeta { index, op })
+    }
+
+    pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&MutationSourceMeta> {
+        match info.as_any().downcast_ref::<MutationSourceMeta>() {
+            Some(part_ref) => Ok(part_ref),
+            None => Err(ErrorCode::Internal(
+                "Cannot downcast from BlockMetaInfo to MutationSourceMeta.",
+            )),
+        }
+    }
+}
 
 #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)]
-pub struct MutationMeta {
+pub struct MutationSinkMeta {
     pub segments: Vec<Location>,
     pub summary: Statistics,
     pub abort_operation: AbortOperation,
 }
 
-#[typetag::serde(name = "mutation_meta")]
-impl BlockMetaInfo for MutationMeta {
+#[typetag::serde(name = "mutation_sink_meta")]
+impl BlockMetaInfo for MutationSinkMeta {
     fn as_any(&self) -> &dyn Any {
         self
     }
@@ -45,31 +98,31 @@ impl BlockMetaInfo for MutationMeta {
     }
 
     fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool {
-        match info.as_any().downcast_ref::<MutationMeta>() {
+        match info.as_any().downcast_ref::<MutationSinkMeta>() {
             None => false,
             Some(other) => self == other,
         }
     }
 }
 
-impl MutationMeta {
+impl MutationSinkMeta {
     pub fn create(
         segments: Vec<Location>,
         summary: Statistics,
         abort_operation: AbortOperation,
     ) -> BlockMetaInfoPtr {
-        Box::new(MutationMeta {
+        Box::new(MutationSinkMeta {
             segments,
             summary,
             abort_operation,
         })
     }
 
-    pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&MutationMeta> {
-        match info.as_any().downcast_ref::<MutationMeta>() {
+    pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&MutationSinkMeta> {
+        match info.as_any().downcast_ref::<MutationSinkMeta>() {
             Some(part_ref) => Ok(part_ref),
             None => Err(ErrorCode::Internal(
-                "Cannot downcast from BlockMetaInfo to MutationMeta.",
+                "Cannot downcast from BlockMetaInfo to MutationSinkMeta.",
             )),
         }
     }
diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs b/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs
index 320f32a19f8b..70b97b3d52c9 100644
--- a/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs
+++ b/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs
@@ -37,7 +37,7 @@ use crate::metrics::metrics_inc_commit_mutation_unresolvable_conflict;
 use crate::operations::commit::Conflict;
 use crate::operations::commit::MutatorConflictDetector;
 use crate::operations::mutation::AbortOperation;
-use crate::operations::mutation::MutationMeta;
+use crate::operations::mutation::MutationSinkMeta;
 use crate::pipelines::processors::port::InputPort;
crate::pipelines::processors::port::InputPort; use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::processor::ProcessorPtr; @@ -158,7 +158,7 @@ impl Processor for MutationSink { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::None) { State::ReadMeta(input_meta) => { - let meta = MutationMeta::from_meta(&input_meta)?; + let meta = MutationSinkMeta::from_meta(&input_meta)?; let affect_rows = self .base_snapshot diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_transform.rs b/src/query/storages/fuse/src/operations/mutation/mutation_transform.rs similarity index 93% rename from src/query/storages/fuse/src/operations/mutation/deletion/deletion_transform.rs rename to src/query/storages/fuse/src/operations/mutation/mutation_transform.rs index 7a79072b27e9..1efc7347fb01 100644 --- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_transform.rs @@ -33,10 +33,10 @@ use opendal::Operator; use crate::io::try_join_futures; use crate::io::SegmentsIO; use crate::io::TableMetaLocationGenerator; -use crate::operations::mutation::deletion::deletion_meta::DeletionSourceMeta; -use crate::operations::mutation::deletion::Deletion; use crate::operations::mutation::AbortOperation; -use crate::operations::mutation::MutationMeta; +use crate::operations::mutation::Mutation; +use crate::operations::mutation::MutationSinkMeta; +use crate::operations::mutation::MutationSourceMeta; use crate::pipelines::processors::port::InputPort; use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; @@ -45,7 +45,7 @@ use crate::pipelines::processors::Processor; use crate::statistics::reducers::merge_statistics_mut; use crate::statistics::reducers::reduce_block_metas; -type DeletionMap = HashMap<usize, (Vec<(usize, Arc<BlockMeta>)>, Vec<usize>)>; +type MutationMap = HashMap<usize, (Vec<(usize, Arc<BlockMeta>)>, Vec<usize>)>; struct SerializedData { data: Vec<u8>, @@ -69,7 +69,7 @@ enum State { }, } -pub struct DeletionTransform { +pub struct MutationTransform { state: State, ctx: Arc<dyn TableContext>, dal: Operator, @@ -80,13 +80,13 @@ pub struct DeletionTransform { abort_operation: AbortOperation, inputs: Vec<Arc<InputPort>>, - input_metas: DeletionMap, + input_metas: MutationMap, cur_input_index: usize, output: Arc<OutputPort>, output_data: Option<DataBlock>, } -impl DeletionTransform { +impl MutationTransform { pub fn try_create( ctx: Arc<dyn TableContext>, inputs: Vec<Arc<InputPort>>, @@ -96,7 +96,7 @@ impl DeletionTransform { base_segments: Vec<Location>, thresholds: BlockCompactThresholds, ) -> Result<ProcessorPtr> { - Ok(ProcessorPtr::create(Box::new(DeletionTransform { + Ok(ProcessorPtr::create(Box::new(MutationTransform { state: State::None, ctx, dal, @@ -160,7 +160,7 @@ impl DeletionTransform { try_join_futures( self.ctx.clone(), handles, - "deletion-write-segments-worker".to_owned(), + "mutation-write-segments-worker".to_owned(), ) .await? .into_iter() @@ -170,9 +170,9 @@ impl DeletionTransform { } #[async_trait::async_trait] -impl Processor for DeletionTransform { +impl Processor for MutationTransform { fn name(&self) -> String { - "DeletionTransform".to_string() + "MutationTransform".to_string() } fn as_any(&mut self) -> &mut dyn Any { @@ -228,22 +228,22 @@ impl Processor for DeletionTransform { .get_meta() .cloned() .ok_or_else(|| ErrorCode::Internal("No block meta.
It's a bug"))?; - let meta = DeletionSourceMeta::from_meta(&input_meta)?; + let meta = MutationSourceMeta::from_meta(&input_meta)?; match &meta.op { - Deletion::Replaced(block_meta) => { + Mutation::Replaced(block_meta) => { self.input_metas .entry(meta.index.0) .and_modify(|v| v.0.push((meta.index.1, block_meta.clone()))) .or_insert((vec![(meta.index.1, block_meta.clone())], vec![])); self.abort_operation.add_block(block_meta); } - Deletion::Deleted => { + Mutation::Deleted => { self.input_metas .entry(meta.index.0) .and_modify(|v| v.1.push(meta.index.1)) .or_insert((vec![], vec![meta.index.1])); } - Deletion::DoNothing => (), + Mutation::DoNothing => (), } } State::GenerateSegments(segment_infos) => { @@ -305,7 +305,7 @@ impl Processor for DeletionTransform { }; } State::Output { segments, summary } => { - let meta = MutationMeta::create( + let meta = MutationSinkMeta::create( segments, summary, std::mem::take(&mut self.abort_operation), diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs index d79f2cb34494..05b474dc0459 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/mod.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod update_meta; mod update_source; pub use update_source::UpdateSource; diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs b/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs deleted file mode 100644 index a40db4c80703..000000000000 --- a/src/query/storages/fuse/src/operations/mutation/update/update_meta.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
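The input_metas accumulation just shown groups per-block ops by segment so that each affected segment is rewritten once. A self-contained sketch of that accumulation with stand-in types (BlockMetaStub replaces the real Arc of BlockMeta):

    use std::collections::HashMap;

    type SegmentIdx = usize;
    type BlockIdx = usize;
    #[derive(Clone)]
    struct BlockMetaStub;

    enum Op {
        DoNothing,
        Replaced(BlockIdx, BlockMetaStub),
        Deleted(BlockIdx),
    }

    // (replaced blocks, deleted blocks) per segment, mirroring MutationMap.
    type Acc = HashMap<SegmentIdx, (Vec<(BlockIdx, BlockMetaStub)>, Vec<BlockIdx>)>;

    fn accumulate(acc: &mut Acc, seg: SegmentIdx, op: Op) {
        let entry = acc.entry(seg).or_insert((vec![], vec![]));
        match op {
            Op::Replaced(idx, meta) => entry.0.push((idx, meta)),
            Op::Deleted(idx) => entry.1.push(idx),
            Op::DoNothing => {}
        }
    }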
- -use std::any::Any; -use std::sync::Arc; - -use common_datablocks::BlockMetaInfo; -use common_datablocks::BlockMetaInfoPtr; -use common_exception::ErrorCode; -use common_exception::Result; -use common_storages_table_meta::meta::BlockMeta; - -use crate::pruning::BlockIndex; - -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] -pub struct UpdateSourceMeta { - pub index: BlockIndex, - pub replace: Arc, -} - -#[typetag::serde(name = "update_source_meta")] -impl BlockMetaInfo for UpdateSourceMeta { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn clone_self(&self) -> Box { - Box::new(self.clone()) - } - - fn equals(&self, info: &Box) -> bool { - match info.as_any().downcast_ref::() { - None => false, - Some(other) => self == other, - } - } -} - -impl UpdateSourceMeta { - pub fn create(index: BlockIndex, replace: Arc) -> BlockMetaInfoPtr { - Box::new(UpdateSourceMeta { index, replace }) - } - - pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&UpdateSourceMeta> { - match info.as_any().downcast_ref::() { - Some(part_ref) => Ok(part_ref), - None => Err(ErrorCode::Internal( - "Cannot downcast from BlockMetaInfo to UpdateSourceMeta.", - )), - } - } -} diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index e0e696bc87bb..f2835e813ff0 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -29,11 +29,12 @@ use common_storages_table_meta::meta::BlockMeta; use common_storages_table_meta::table::TableCompression; use opendal::Operator; -use super::update_meta::UpdateSourceMeta; use crate::io::write_data; use crate::io::BlockReader; use crate::io::TableMetaLocationGenerator; +use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; +use crate::operations::mutation::MutationSourceMeta; use crate::operations::util; use crate::operations::BloomIndexState; use crate::pipelines::processors::port::OutputPort; @@ -64,7 +65,7 @@ enum State { UpdateData(DataBlock), NeedSerialize(DataBlock), Serialized(SerializeState, Arc), - Generated(Arc), + Generated(Mutation), Output(Option, DataBlock), Finish, } @@ -184,8 +185,7 @@ impl Processor for UpdateSource { self.state = State::ReadRemain(part, data_block); } } else { - let new_part = self.ctx.try_get_part(); - self.state = State::Output(new_part, DataBlock::empty()); + self.state = State::Generated(Mutation::DoNothing); } } else { self.state = State::UpdateData(data_block); @@ -267,8 +267,8 @@ impl Processor for UpdateSource { new_meta, ); } - State::Generated(replace) => { - let meta = UpdateSourceMeta::create(self.index, replace); + State::Generated(op) => { + let meta = MutationSourceMeta::create(self.index, op); let new_part = self.ctx.try_get_part(); self.state = State::Output(new_part, DataBlock::empty_with_meta(meta)); } @@ -318,7 +318,7 @@ impl Processor for UpdateSource { &serialize_state.index_location, ) .await?; - self.state = State::Generated(block_meta); + self.state = State::Generated(Mutation::Replaced(block_meta)); } _ => return Err(ErrorCode::Internal("It's a bug.")), } diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index bffd59e0cee7..7fe4f9345e4b 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ 
b/src/query/storages/fuse/src/operations/update.rs @@ -28,6 +28,7 @@ use common_exception::Result; use common_sql::evaluator::ChunkOperator; use common_sql::evaluator::Evaluator; +use super::mutation::MutationSink; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::UpdateSource; use crate::pipelines::Pipeline; @@ -149,7 +150,7 @@ impl FuseTable { } (Projection::Columns(col_indices.clone()), vec![ - filter.unwrap().clone(), + filter.unwrap(), ]) }; @@ -228,6 +229,11 @@ impl FuseTable { }, max_threads, )?; + + self.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?; + pipeline.add_sink(|input| { + MutationSink::try_create(self, ctx.clone(), snapshot.clone(), input) + })?; Ok(()) } } From 518fa7485f05e049f7cb902ac07b7289301904cb Mon Sep 17 00:00:00 2001 From: zhyass Date: Thu, 29 Dec 2022 23:15:27 +0800 Subject: [PATCH 07/26] add interpreter_update --- src/query/catalog/src/table.rs | 18 ++++++++ .../src/interpreters/interpreter_delete.rs | 4 +- .../src/interpreters/interpreter_factory.rs | 9 ++-- .../src/interpreters/interpreter_update.rs | 43 +++++++++++++++++++ src/query/service/src/interpreters/mod.rs | 1 + src/query/sql/src/planner/plans/update.rs | 9 ++++ src/query/storages/fuse/src/fuse_table.rs | 12 ++++++ .../storages/fuse/src/operations/update.rs | 2 +- 8 files changed, 91 insertions(+), 7 deletions(-) diff --git a/src/query/catalog/src/table.rs b/src/query/catalog/src/table.rs index 77e605f1e62d..7cd709b2dad7 100644 --- a/src/query/catalog/src/table.rs +++ b/src/query/catalog/src/table.rs @@ -14,6 +14,7 @@ use std::any::Any; use std::collections::BTreeMap; +use std::collections::HashMap; use std::sync::Arc; use chrono::DateTime; @@ -240,6 +241,23 @@ pub trait Table: Sync + Send { ))) } + async fn update( + &self, + ctx: Arc<dyn TableContext>, + filter: Option<Expression>, + col_indices: Vec<usize>, + update_list: HashMap<usize, Expression>, + pipeline: &mut Pipeline, + ) -> Result<()> { + let (_, _, _, _, _) = (ctx, filter, col_indices, update_list, pipeline); + + Err(ErrorCode::Unimplemented(format!( + "table {}, of engine type {}, does not support UPDATE", + self.name(), + self.get_table_info().engine(), + ))) + } + fn get_block_compact_thresholds(&self) -> BlockCompactThresholds { BlockCompactThresholds { max_rows_per_block: 1000 * 1000, diff --git a/src/query/service/src/interpreters/interpreter_delete.rs b/src/query/service/src/interpreters/interpreter_delete.rs index e565c9336fe3..63e59bc8f0be 100644 --- a/src/query/service/src/interpreters/interpreter_delete.rs +++ b/src/query/service/src/interpreters/interpreter_delete.rs @@ -35,7 +35,7 @@ pub struct DeleteInterpreter { } impl DeleteInterpreter { - /// Create the DelectInterpreter from DelectPlan + /// Create the DeleteInterpreter from DeletePlan pub fn try_create(ctx: Arc<QueryContext>, plan: DeletePlan) -> Result<Self> { Ok(DeleteInterpreter { ctx, plan }) } @@ -48,7 +48,7 @@ impl Interpreter for DeleteInterpreter { "DeleteInterpreter" } - /// Get the schema of SelectPlan + /// Get the schema of DeletePlan fn schema(&self) -> DataSchemaRef { self.plan.schema() } diff --git a/src/query/service/src/interpreters/interpreter_factory.rs b/src/query/service/src/interpreters/interpreter_factory.rs index 942c1b148706..302322bfd5eb 100644 --- a/src/query/service/src/interpreters/interpreter_factory.rs +++ b/src/query/service/src/interpreters/interpreter_factory.rs @@ -15,7 +15,6 @@ use std::sync::Arc; use common_ast::ast::ExplainKind; -use common_exception::ErrorCode; use common_exception::Result; use tracing::error; @@ -35,6 +34,7 @@ use crate::interpreters::CreateShareInterpreter; use crate::interpreters::DropShareInterpreter; use crate::interpreters::DropUserInterpreter; use crate::interpreters::SetRoleInterpreter; +use crate::interpreters::UpdateInterpreter; use crate::sessions::QueryContext; use crate::sql::plans::Plan; @@ -200,9 +200,10 @@ impl InterpreterFactory { *delete.clone(), )?)), - Plan::Update(_update) => Err(ErrorCode::Unimplemented( - "Unimplement for update".to_string(), - )), + Plan::Update(update) => Ok(Arc::new(UpdateInterpreter::try_create( + ctx, + *update.clone(), + )?)), // Roles Plan::CreateRole(create_role) => Ok(Arc::new(CreateRoleInterpreter::try_create( diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index 598edffcf3f6..804aabdc26e1 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -11,3 +11,46 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +use std::sync::Arc; + +use common_datavalues::DataSchemaRef; +use common_exception::Result; +use common_sql::plans::UpdatePlan; + +use crate::interpreters::Interpreter; +use crate::pipelines::PipelineBuildResult; +use crate::sessions::QueryContext; +use crate::sessions::TableContext; + +/// interprets UpdatePlan +pub struct UpdateInterpreter { + ctx: Arc<QueryContext>, + plan: UpdatePlan, +} + +impl UpdateInterpreter { + /// Create the UpdateInterpreter from UpdatePlan + pub fn try_create(ctx: Arc<QueryContext>, plan: UpdatePlan) -> Result<Self> { + Ok(UpdateInterpreter { ctx, plan }) + } +} + +#[async_trait::async_trait] +impl Interpreter for UpdateInterpreter { + /// Get the name of current interpreter + fn name(&self) -> &str { + "UpdateInterpreter" + } + + /// Get the schema of UpdatePlan + fn schema(&self) -> DataSchemaRef { + self.plan.schema() + } + + #[tracing::instrument(level = "debug", name = "update_interpreter_execute", skip(self), fields(ctx.id = self.ctx.get_id().as_str()))] + async fn execute2(&self) -> Result<PipelineBuildResult> { + // TODO check privilege + todo!() + } +} diff --git a/src/query/service/src/interpreters/mod.rs b/src/query/service/src/interpreters/mod.rs index 1f7ecd705f91..f484bb563d16 100644 --- a/src/query/service/src/interpreters/mod.rs +++ b/src/query/service/src/interpreters/mod.rs @@ -136,6 +136,7 @@ pub use interpreter_table_show_create::ShowCreateTableInterpreter; pub use interpreter_table_truncate::TruncateTableInterpreter; pub use interpreter_table_undrop::UndropTableInterpreter; pub use interpreter_unsetting::UnSettingInterpreter; +pub use interpreter_update::UpdateInterpreter; pub use interpreter_use_database::UseDatabaseInterpreter; pub use interpreter_user_alter::AlterUserInterpreter; pub use interpreter_user_create::CreateUserInterpreter; diff --git a/src/query/sql/src/planner/plans/update.rs b/src/query/sql/src/planner/plans/update.rs index 768c9ed12c97..cd541b282fbb 100644 --- a/src/query/sql/src/planner/plans/update.rs +++ b/src/query/sql/src/planner/plans/update.rs @@ -13,7 +13,10 @@ // limitations under the License.
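End to end, a statement reaches the new interpreter through the planner and the factory dispatch above. A sketch of that flow (the plan_sql call mirrors the unit test later in this series; the InterpreterFactory::get entry point is an assumption, named after the dispatch shown above):

    // Sketch; error handling elided.
    let mut planner = Planner::new(ctx.clone());
    let (plan, _, _) = planner
        .plan_sql("UPDATE t SET a = 3 WHERE b > '2022-12-31'")
        .await?;
    let interpreter = InterpreterFactory::get(ctx.clone(), &plan).await?; // assumed entry point
    let _ = interpreter.execute2().await?;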
use std::collections::HashMap; +use std::sync::Arc; +use common_datavalues::DataSchema; +use common_datavalues::DataSchemaRef; use common_meta_types::MetaId; use crate::plans::Scalar; @@ -27,3 +30,9 @@ pub struct UpdatePlan { pub update_list: HashMap, pub selection: Option, } + +impl UpdatePlan { + pub fn schema(&self) -> DataSchemaRef { + Arc::new(DataSchema::empty()) + } +} diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index 3867f96d998e..b3408695b198 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -513,6 +513,18 @@ impl Table for FuseTable { self.do_delete(ctx, filter, col_indices, pipeline).await } + async fn update( + &self, + ctx: Arc, + filter: Option, + col_indices: Vec, + update_list: HashMap, + pipeline: &mut Pipeline, + ) -> Result<()> { + self.do_update(ctx, filter, col_indices, update_list, pipeline) + .await + } + fn get_block_compact_thresholds(&self) -> BlockCompactThresholds { let max_rows_per_block = self.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_ROW_PER_BLOCK); let min_rows_per_block = (max_rows_per_block as f64 * 0.8) as usize; diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index 7fe4f9345e4b..7ddbb1c63a06 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -28,8 +28,8 @@ use common_exception::Result; use common_sql::evaluator::ChunkOperator; use common_sql::evaluator::Evaluator; -use super::mutation::MutationSink; use crate::operations::mutation::MutationPartInfo; +use crate::operations::mutation::MutationSink; use crate::operations::mutation::UpdateSource; use crate::pipelines::Pipeline; use crate::pruning::BlockPruner; From aba9f907aaa5f07cfd77c3d060b4d17d98685e03 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 30 Dec 2022 17:32:23 +0800 Subject: [PATCH 08/26] enable update --- src/query/catalog/src/table.rs | 3 +- .../src/interpreters/interpreter_delete.rs | 5 +- .../src/interpreters/interpreter_update.rs | 55 ++++++++++++++++++- src/query/sql/src/planner/binder/update.rs | 1 + src/query/sql/src/planner/plans/update.rs | 2 + src/query/storages/fuse/src/fuse_table.rs | 2 +- .../storages/fuse/src/operations/delete.rs | 2 +- .../storages/fuse/src/operations/update.rs | 5 +- 8 files changed, 64 insertions(+), 11 deletions(-) diff --git a/src/query/catalog/src/table.rs b/src/query/catalog/src/table.rs index 7cd709b2dad7..4138e7ad96ac 100644 --- a/src/query/catalog/src/table.rs +++ b/src/query/catalog/src/table.rs @@ -14,7 +14,6 @@ use std::any::Any; use std::collections::BTreeMap; -use std::collections::HashMap; use std::sync::Arc; use chrono::DateTime; @@ -246,7 +245,7 @@ pub trait Table: Sync + Send { ctx: Arc, filter: Option, col_indices: Vec, - update_list: HashMap, + update_list: Vec<(usize, Expression)>, pipeline: &mut Pipeline, ) -> Result<()> { let (_, _, _, _, _) = (ctx, filter, col_indices, update_list, pipeline); diff --git a/src/query/service/src/interpreters/interpreter_delete.rs b/src/query/service/src/interpreters/interpreter_delete.rs index 63e59bc8f0be..8a016a4ffc15 100644 --- a/src/query/service/src/interpreters/interpreter_delete.rs +++ b/src/query/service/src/interpreters/interpreter_delete.rs @@ -17,8 +17,6 @@ use std::sync::Arc; use common_datavalues::DataSchemaRef; use common_exception::Result; use common_pipeline_core::Pipeline; -use common_sql::executor::ExpressionBuilderWithoutRenaming; -use 
common_sql::plans::DeletePlan; use crate::interpreters::Interpreter; use crate::pipelines::executor::ExecutorSettings; @@ -26,6 +24,8 @@ use crate::pipelines::executor::PipelineCompleteExecutor; use crate::pipelines::PipelineBuildResult; use crate::sessions::QueryContext; use crate::sessions::TableContext; +use crate::sql::executor::ExpressionBuilderWithoutRenaming; +use crate::sql::plans::DeletePlan; use crate::sql::plans::ScalarExpr; /// interprets DeletePlan @@ -60,6 +60,7 @@ impl Interpreter for DeleteInterpreter { let db_name = self.plan.database_name.as_str(); let tbl_name = self.plan.table_name.as_str(); let tbl = self.ctx.get_table(catalog_name, db_name, tbl_name).await?; + let (filter, col_indices) = if let Some(scalar) = &self.plan.selection { let eb = ExpressionBuilderWithoutRenaming::create(self.plan.metadata.clone()); ( diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index 804aabdc26e1..c64225c3102d 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -15,13 +15,19 @@ use std::sync::Arc; use common_datavalues::DataSchemaRef; +use common_exception::ErrorCode; use common_exception::Result; -use common_sql::plans::UpdatePlan; +use common_pipeline_core::Pipeline; use crate::interpreters::Interpreter; +use crate::pipelines::executor::ExecutorSettings; +use crate::pipelines::executor::PipelineCompleteExecutor; use crate::pipelines::PipelineBuildResult; use crate::sessions::QueryContext; use crate::sessions::TableContext; +use crate::sql::executor::ExpressionBuilderWithoutRenaming; +use crate::sql::plans::ScalarExpr; +use crate::sql::plans::UpdatePlan; /// interprets UpdatePlan pub struct UpdateInterpreter { @@ -51,6 +57,51 @@ impl Interpreter for UpdateInterpreter { #[tracing::instrument(level = "debug", name = "update_interpreter_execute", skip(self), fields(ctx.id = self.ctx.get_id().as_str()))] async fn execute2(&self) -> Result { // TODO check privilege - todo!() + let catalog_name = self.plan.catalog.as_str(); + let db_name = self.plan.database.as_str(); + let tbl_name = self.plan.table.as_str(); + let tbl = self.ctx.get_table(catalog_name, db_name, tbl_name).await?; + + let eb = ExpressionBuilderWithoutRenaming::create(self.plan.metadata.clone()); + let (filter, col_indices) = if let Some(scalar) = &self.plan.selection { + ( + Some(eb.build(scalar)?), + scalar.used_columns().into_iter().collect(), + ) + } else { + (None, vec![]) + }; + + let update_list = self.plan.update_list.iter().try_fold( + Vec::with_capacity(self.plan.update_list.len()), + |mut acc, (id, scalar)| { + let expr = eb.build(scalar)?; + acc.push((*id, expr)); + Ok::<_, ErrorCode>(acc) + }, + )?; + + let mut pipeline = Pipeline::create(); + tbl.update( + self.ctx.clone(), + filter, + col_indices, + update_list, + &mut pipeline, + ) + .await?; + if !pipeline.pipes.is_empty() { + let settings = self.ctx.get_settings(); + pipeline.set_max_threads(settings.get_max_threads()? 
as usize); + let query_id = self.ctx.get_id(); + let executor_settings = ExecutorSettings::try_create(&settings, query_id)?; + let executor = PipelineCompleteExecutor::try_create(pipeline, executor_settings)?; + + self.ctx.set_executor(Arc::downgrade(&executor.get_inner())); + executor.execute()?; + drop(executor); + } + + Ok(PipelineBuildResult::create()) } } diff --git a/src/query/sql/src/planner/binder/update.rs b/src/query/sql/src/planner/binder/update.rs index 1965d729d32c..77dcd39abf3d 100644 --- a/src/query/sql/src/planner/binder/update.rs +++ b/src/query/sql/src/planner/binder/update.rs @@ -104,6 +104,7 @@ impl<'a> Binder { database: database_name, table: table_name, table_id, + metadata: self.metadata.clone(), update_list: update_columns, selection: push_downs, }; diff --git a/src/query/sql/src/planner/plans/update.rs b/src/query/sql/src/planner/plans/update.rs index cd541b282fbb..9f7fc58cff23 100644 --- a/src/query/sql/src/planner/plans/update.rs +++ b/src/query/sql/src/planner/plans/update.rs @@ -20,6 +20,7 @@ use common_datavalues::DataSchemaRef; use common_meta_types::MetaId; use crate::plans::Scalar; +use crate::MetadataRef; #[derive(Clone, Debug)] pub struct UpdatePlan { @@ -27,6 +28,7 @@ pub struct UpdatePlan { pub database: String, pub table: String, pub table_id: MetaId, + pub metadata: MetadataRef, pub update_list: HashMap, pub selection: Option, } diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index b3408695b198..c3088488ff22 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -518,7 +518,7 @@ impl Table for FuseTable { ctx: Arc, filter: Option, col_indices: Vec, - update_list: HashMap, + update_list: Vec<(usize, Expression)>, pipeline: &mut Pipeline, ) -> Result<()> { self.do_update(ctx, filter, col_indices, update_list, pipeline) diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs index 16c18a76d03a..8d4041dac2a3 100644 --- a/src/query/storages/fuse/src/operations/delete.rs +++ b/src/query/storages/fuse/src/operations/delete.rs @@ -176,7 +176,7 @@ impl FuseTable { let (_, inner_parts) = self.read_partitions_with_metas( ctx.clone(), self.table_info.schema(), - push_down, + None, metas, base_snapshot.summary.block_count as usize, )?; diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index 7ddbb1c63a06..d157e58a3a83 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -13,7 +13,6 @@ // limitations under the License. 
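To make the shapes concrete: for a hypothetical table t(a INT, b INT), the statement UPDATE t SET a = a + 1 WHERE b > 0 leaves the interpreter above with filter = Some(b > 0), col_indices = the ids of the columns the filter reads (here just b), and update_list = [(id of a, a + 1)], one (column id, expression) pair per SET clause.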
use std::collections::BTreeMap; -use std::collections::HashMap; use std::sync::Arc; use common_catalog::plan::Expression; @@ -41,7 +40,7 @@ impl FuseTable { ctx: Arc, filter: Option, col_indices: Vec, - update_list: HashMap, + update_list: Vec<(usize, Expression)>, pipeline: &mut Pipeline, ) -> Result<()> { let snapshot_opt = self.read_table_snapshot().await?; @@ -198,7 +197,7 @@ impl FuseTable { let (_, inner_parts) = self.read_partitions_with_metas( ctx.clone(), self.table_info.schema(), - push_down, + None, metas, snapshot.summary.block_count as usize, )?; From 5e17e8c06d33a8111097fb73c3ee7713f6849822 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 30 Dec 2022 18:09:46 +0800 Subject: [PATCH 09/26] fix test case --- src/query/service/src/interpreters/interpreter_update.rs | 1 + .../suites/duckdb/issues/monetdb/test_correlated_update.test | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index c64225c3102d..96eb1dd5525f 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -63,6 +63,7 @@ impl Interpreter for UpdateInterpreter { let tbl = self.ctx.get_table(catalog_name, db_name, tbl_name).await?; let eb = ExpressionBuilderWithoutRenaming::create(self.plan.metadata.clone()); + // TODO: selection and update_list support subquery. let (filter, col_indices) = if let Some(scalar) = &self.plan.selection { ( Some(eb.build(scalar)?), diff --git a/tests/sqllogictests/suites/duckdb/issues/monetdb/test_correlated_update.test b/tests/sqllogictests/suites/duckdb/issues/monetdb/test_correlated_update.test index 0e7c704c3cc3..b160bd0673db 100644 --- a/tests/sqllogictests/suites/duckdb/issues/monetdb/test_correlated_update.test +++ b/tests/sqllogictests/suites/duckdb/issues/monetdb/test_correlated_update.test @@ -16,7 +16,7 @@ insert into t1284791a values (1,'1') statement ok insert into t1284791b values (1,'2') -statement error 1002 +statement error 1001 update t1284791a set val1 = (select val2 from t1284791b where id1 = id2) where id1 in (select id2 from t1284791b) query IT From b1a4d217127476df70ee38b6fdff839a452c8d64 Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 3 Jan 2023 18:52:58 +0800 Subject: [PATCH 10/26] Add unit test --- .../src/interpreters/interpreter_update.rs | 2 +- .../{deletion_mutator.rs => deletion.rs} | 0 .../storages/fuse/operations/mutation/mod.rs | 5 +- .../fuse/operations/mutation/update.rs | 124 ++++++++++++++++++ .../mutation/deletion/deletion_source.rs | 11 +- .../fuse/src/operations/mutation/mod.rs | 3 + .../mutation/update/update_source.rs | 62 +++++---- .../fuse/src/operations/mutation/util.rs | 22 ++++ .../storages/fuse/src/operations/update.rs | 26 +++- 9 files changed, 212 insertions(+), 43 deletions(-) rename src/query/service/tests/it/storages/fuse/operations/mutation/{deletion_mutator.rs => deletion.rs} (100%) create mode 100644 src/query/service/tests/it/storages/fuse/operations/mutation/update.rs create mode 100644 src/query/storages/fuse/src/operations/mutation/util.rs diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index 96eb1dd5525f..f23ea07a0489 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -63,7 +63,7 @@ impl Interpreter for UpdateInterpreter { let tbl = 
self.ctx.get_table(catalog_name, db_name, tbl_name).await?; let eb = ExpressionBuilderWithoutRenaming::create(self.plan.metadata.clone()); - // TODO: selection and update_list support subquery. + // TODO(zhyass): selection and update_list support subquery. let (filter, col_indices) = if let Some(scalar) = &self.plan.selection { ( Some(eb.build(scalar)?), diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/deletion_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/deletion.rs similarity index 100% rename from src/query/service/tests/it/storages/fuse/operations/mutation/deletion_mutator.rs rename to src/query/service/tests/it/storages/fuse/operations/mutation/deletion.rs diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs index 634e6cc25d6b..ec24f4309b8b 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs @@ -13,8 +13,9 @@ // limitations under the License. mod block_compact_mutator; -mod deletion_mutator; +mod deletion; mod recluster_mutator; mod segments_compact_mutator; +mod update; -pub use deletion_mutator::do_deletion; +pub use deletion::do_deletion; diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/update.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/update.rs new file mode 100644 index 000000000000..5ae69eb698e2 --- /dev/null +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/update.rs @@ -0,0 +1,124 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
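In SQL terms, the unit test that follows performs ten single-row inserts (id 0 through 9, each producing its own segment), runs UPDATE ... SET id = 0 WHERE id > 0 through the same do_update path the interpreter uses, and then asserts that SELECT count(1) ... WHERE id = 0 returns 10.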
+ +use std::sync::Arc; + +use common_base::base::tokio; +use common_exception::ErrorCode; +use common_exception::Result; +use common_sql::executor::ExpressionBuilderWithoutRenaming; +use common_sql::plans::Plan; +use common_sql::plans::ScalarExpr; +use common_sql::plans::UpdatePlan; +use common_sql::Planner; +use common_storages_factory::Table; +use common_storages_fuse::FuseTable; +use databend_query::pipelines::executor::ExecutorSettings; +use databend_query::pipelines::executor::PipelineCompleteExecutor; +use databend_query::sessions::QueryContext; +use databend_query::sessions::TableContext; + +use crate::storages::fuse::table_test_fixture::execute_command; +use crate::storages::fuse::table_test_fixture::execute_query; +use crate::storages::fuse::table_test_fixture::expects_ok; +use crate::storages::fuse::table_test_fixture::TestFixture; + +#[tokio::test(flavor = "multi_thread")] +async fn test_update_mutator_multiple_empty_segments() -> Result<()> { + let fixture = TestFixture::new().await; + let ctx = fixture.ctx(); + let tbl_name = fixture.default_table_name(); + let db_name = fixture.default_db_name(); + + fixture.create_normal_table().await?; + + // insert + for i in 0..10 { + let qry = format!("insert into {}.{}(id) values({})", db_name, tbl_name, i); + execute_command(ctx.clone(), qry.as_str()).await?; + } + + let catalog = ctx.get_catalog(fixture.default_catalog_name().as_str())?; + let table = catalog + .get_table(ctx.get_tenant().as_str(), &db_name, &tbl_name) + .await?; + // update + let query = format!("update {}.{} set id=0 where id>0", db_name, tbl_name); + let mut planner = Planner::new(ctx.clone()); + let (plan, _, _) = planner.plan_sql(&query).await?; + if let Plan::Update(update) = plan { + do_update(ctx.clone(), table.clone(), *update).await?; + } + + // check count + let expected = vec![ + "+-------+", + "| count |", + "+-------+", + "| 10 |", + "+-------+", + ]; + let qry = format!( + "select count(1) as count from {}.{} where id=0", + db_name, tbl_name + ); + expects_ok( + "check count", + execute_query(fixture.ctx(), qry.as_str()).await, + expected, + ) + .await?; + Ok(()) +} + +pub async fn do_update( + ctx: Arc, + table: Arc, + plan: UpdatePlan, +) -> Result<()> { + let eb = ExpressionBuilderWithoutRenaming::create(plan.metadata.clone()); + let (filter, col_indices) = if let Some(scalar) = &plan.selection { + ( + Some(eb.build(scalar)?), + scalar.used_columns().into_iter().collect(), + ) + } else { + (None, vec![]) + }; + let update_list = plan.update_list.iter().try_fold( + Vec::with_capacity(plan.update_list.len()), + |mut acc, (id, scalar)| { + let expr = eb.build(scalar)?; + acc.push((*id, expr)); + Ok::<_, ErrorCode>(acc) + }, + )?; + + let fuse_table = FuseTable::try_from_table(table.as_ref())?; + let settings = ctx.get_settings(); + let mut pipeline = common_pipeline_core::Pipeline::create(); + fuse_table + .update(ctx.clone(), filter, col_indices, update_list, &mut pipeline) + .await?; + if !pipeline.pipes.is_empty() { + pipeline.set_max_threads(settings.get_max_threads()? 
as usize); + let query_id = ctx.get_id(); + let executor_settings = ExecutorSettings::try_create(&settings, query_id)?; + let executor = PipelineCompleteExecutor::try_create(pipeline, executor_settings)?; + ctx.set_executor(Arc::downgrade(&executor.get_inner())); + executor.execute()?; + drop(executor); + } + Ok(()) +} diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs index 18807c5cf09e..c4e2e67e05df 100644 --- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs @@ -35,9 +35,11 @@ use opendal::Operator; use crate::io::write_data; use crate::io::BlockReader; use crate::io::TableMetaLocationGenerator; +use crate::operations::mutation::DataChunks; use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSourceMeta; +use crate::operations::mutation::SerializeState; use crate::operations::util; use crate::operations::BloomIndexState; use crate::pipelines::processors::port::OutputPort; @@ -50,15 +52,6 @@ use crate::statistics::ClusterStatsGenerator; use crate::FuseTable; use crate::Table; -type DataChunks = Vec<(usize, Vec)>; - -struct SerializeState { - block_data: Vec, - block_location: String, - index_data: Vec, - index_location: String, -} - enum State { ReadData(Option), FilterData(PartInfoPtr, DataChunks), diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs index a5025cd5116d..a672d2ffda59 100644 --- a/src/query/storages/fuse/src/operations/mutation/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/mod.rs @@ -22,6 +22,7 @@ pub mod mutation_sink; mod mutation_transform; pub mod recluster_mutator; mod update; +mod util; pub use abort_operation::AbortOperation; pub use base_mutator::BaseMutator; @@ -41,3 +42,5 @@ pub use mutation_sink::MutationSink; pub use mutation_transform::MutationTransform; pub use recluster_mutator::ReclusterMutator; pub use update::UpdateSource; +pub use util::DataChunks; +pub use util::SerializeState; diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index f2835e813ff0..05c73ac14bd7 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use common_catalog::plan::PartInfoPtr; use common_catalog::table_context::TableContext; use common_datablocks::DataBlock; +use common_datavalues::ColumnRef; use common_datavalues::DataField; use common_datavalues::ToDataType; use common_exception::ErrorCode; @@ -32,9 +33,11 @@ use opendal::Operator; use crate::io::write_data; use crate::io::BlockReader; use crate::io::TableMetaLocationGenerator; +use crate::operations::mutation::DataChunks; use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSourceMeta; +use crate::operations::mutation::SerializeState; use crate::operations::util; use crate::operations::BloomIndexState; use crate::pipelines::processors::port::OutputPort; @@ -45,22 +48,19 @@ use crate::pruning::BlockIndex; use crate::statistics::gen_columns_statistics; use crate::FuseTable; -type DataChunks = Vec<(usize, 
Vec)>; -struct SerializeState { - block_data: Vec, - block_location: String, - index_data: Vec, - index_location: String, -} - enum State { ReadData(Option), FilterData(PartInfoPtr, DataChunks), - ReadRemain(PartInfoPtr, DataBlock), + ReadRemain { + part: PartInfoPtr, + data_block: DataBlock, + filter: ColumnRef, + }, MergeRemain { part: PartInfoPtr, chunks: DataChunks, data_block: DataBlock, + filter: ColumnRef, }, UpdateData(DataBlock), NeedSerialize(DataBlock), @@ -170,19 +170,26 @@ impl Processor for UpdateSource { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Finish) { State::FilterData(part, chunks) => { - let mut data_block = self.block_reader.deserialize(part.clone(), chunks)?; + let data_block = self.block_reader.deserialize(part.clone(), chunks)?; if let Some(filter) = self.filter.as_ref() { let filter_result = filter .eval(&self.ctx.try_get_function_context()?, &data_block)? .vector; - let predicates = DataBlock::cast_to_nonull_boolean(&filter_result)?; - if DataBlock::filter_exists(&predicates)? { - let field = DataField::new("_predicate", bool::to_data_type()); - data_block = data_block.add_column(predicates, field)?; + let filter = DataBlock::cast_to_nonull_boolean(&filter_result)?; + if DataBlock::filter_exists(&filter)? { if self.remain_reader.is_none() { - self.state = State::UpdateData(data_block); + self.state = State::MergeRemain { + part, + chunks: vec![], + data_block, + filter, + }; } else { - self.state = State::ReadRemain(part, data_block); + self.state = State::ReadRemain { + part, + data_block, + filter, + }; } } else { self.state = State::Generated(Mutation::DoNothing); @@ -195,10 +202,9 @@ impl Processor for UpdateSource { part, chunks, mut data_block, + filter, } => { - let merged = if chunks.is_empty() { - data_block - } else if let Some(remain_reader) = self.remain_reader.as_ref() { + if let Some(remain_reader) = self.remain_reader.as_ref() { let remain_block = remain_reader.deserialize(part, chunks)?; for (col, field) in remain_block .columns() @@ -207,11 +213,12 @@ impl Processor for UpdateSource { { data_block = data_block.add_column(col.clone(), field.clone())?; } - data_block - } else { - return Err(ErrorCode::Internal("It's a bug. Need remain reader")); - }; - self.state = State::UpdateData(merged); + } + + let field = DataField::new("_predicate", bool::to_data_type()); + data_block = data_block.add_column(filter, field)?; + + self.state = State::UpdateData(data_block); } State::UpdateData(data_block) => { let func_ctx = self.ctx.try_get_function_context()?; @@ -289,7 +296,11 @@ impl Processor for UpdateSource { .await?; self.state = State::FilterData(inner_part, chunks); } - State::ReadRemain(part, data_block) => { + State::ReadRemain { + part, + data_block, + filter, + } => { if let Some(remain_reader) = self.remain_reader.as_ref() { let chunks = remain_reader .read_columns_data(self.ctx.clone(), part.clone()) @@ -298,6 +309,7 @@ impl Processor for UpdateSource { part, chunks, data_block, + filter, }; } else { return Err(ErrorCode::Internal("It's a bug. No remain reader")); diff --git a/src/query/storages/fuse/src/operations/mutation/util.rs b/src/query/storages/fuse/src/operations/mutation/util.rs new file mode 100644 index 000000000000..e54ffc800200 --- /dev/null +++ b/src/query/storages/fuse/src/operations/mutation/util.rs @@ -0,0 +1,22 @@ +// Copyright 2023 Datafuse Labs. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub type DataChunks = Vec<(usize, Vec<u8>)>; + +pub struct SerializeState { + pub block_data: Vec<u8>, + pub block_location: String, + pub index_data: Vec<u8>, + pub index_location: String, +} diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index d157e58a3a83..b1eec354d28c 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -35,6 +35,15 @@ use crate::pruning::BlockPruner; use crate::FuseTable; impl FuseTable { + /// UPDATE column = expression WHERE condition + /// The flow of Pipeline is as follows: + /// +-------------+ + /// |UpdateSource1| ------ + /// +-------------+ | +-----------------+ +------------+ + /// | ... | ... | ---> |MutationTransform| ---> |MutationSink| + /// +-------------+ | +-----------------+ +------------+ + /// |UpdateSourceN| ------ + /// +-------------+ pub async fn do_update( &self, ctx: Arc<dyn TableContext>, @@ -65,7 +74,7 @@ impl FuseTable { let mut remain_reader = None; let (projection, filters) = if col_indices.is_empty() { if filter.is_some() && !self.try_eval_const(&filter.unwrap())? { - // do nothing. + // The condition is always false, do nothing. return Ok(()); } @@ -76,6 +85,9 @@ impl FuseTable { acc });
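The two operator-building branches below can be read off a concrete case. For UPDATE t SET a = a + 1 WHERE b > 0 over a hypothetical t(a INT, b INT), the per-column rewrite is, conceptually:

    // Sketch of the generated per-row expressions (not the literal
    // Expression API):
    //   _predicate -> b > 0                               (filter result)
    //   a          -> if(_predicate, CAST(a + 1, INT), a) (updated column)
    //   b          -> b                                   (pass-through)
    // With no WHERE clause, or one that const-folds to true, the guard is
    // dropped and a becomes CAST(a + 1, INT) directly.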
+ // Replace column to the result of the following expression: + // CAST(expression, type) for (id, expr) in update_list.into_iter() { let field = schema.field(id); let target = field.data_type(); @@ -99,11 +111,6 @@ impl FuseTable { acc }); - let mut fields = schema.fields().clone(); - fields.push(DataField::new("_predicate", bool::to_data_type())); - let input_schema = Arc::new(DataSchema::new(fields)); - pos += 1; - let remain_col_ids: Vec = all_col_ids .into_iter() .filter(|id| !col_indices.contains(id)) @@ -119,6 +126,13 @@ impl FuseTable { Some((*self.create_block_reader(Projection::Columns(remain_col_ids))?).clone()); } + let mut fields = schema.fields().clone(); + fields.push(DataField::new("_predicate", bool::to_data_type())); + let input_schema = Arc::new(DataSchema::new(fields)); + pos += 1; + + // Replace column to the result of the following expression: + // if(condition, CAST(expression, type), column) for (id, expr) in update_list.into_iter() { let field = schema.field(id); let target = field.data_type(); From bdbc7db2d7b201ea57acb78d49f6f0898207564c Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 3 Jan 2023 19:10:00 +0800 Subject: [PATCH 11/26] fix conflict --- .../mutation/update/update_source.rs | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index 05c73ac14bd7..346ae581ff14 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -30,8 +30,10 @@ use common_storages_table_meta::meta::BlockMeta; use common_storages_table_meta::table::TableCompression; use opendal::Operator; +use crate::fuse_part::FusePartInfo; use crate::io::write_data; use crate::io::BlockReader; +use crate::io::ReadSettings; use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::DataChunks; use crate::operations::mutation::Mutation; @@ -170,7 +172,9 @@ impl Processor for UpdateSource { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Finish) { State::FilterData(part, chunks) => { - let data_block = self.block_reader.deserialize(part.clone(), chunks)?; + let data_block = self + .block_reader + .deserialize_parquet_chunks(part.clone(), chunks)?; if let Some(filter) = self.filter.as_ref() { let filter_result = filter .eval(&self.ctx.try_get_function_context()?, &data_block)? 
@@ -205,7 +209,7 @@ impl Processor for UpdateSource { filter, } => { if let Some(remain_reader) = self.remain_reader.as_ref() { - let remain_block = remain_reader.deserialize(part, chunks)?; + let remain_block = remain_reader.deserialize_parquet_chunks(part, chunks)?; for (col, field) in remain_block .columns() .iter() @@ -287,13 +291,25 @@ impl Processor for UpdateSource { async fn async_process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Finish) { State::ReadData(Some(part)) => { + let settings = ReadSettings::from_ctx(&self.ctx)?; let part = MutationPartInfo::from_part(&part)?; self.index = part.index; let inner_part = part.inner_part.clone(); - let chunks = self + let fuse_part = FusePartInfo::from_part(&inner_part)?; + + let read_res = self .block_reader - .read_columns_data(self.ctx.clone(), inner_part.clone()) + .read_columns_data_by_merge_io( + &settings, + &fuse_part.location, + &fuse_part.columns_meta, + ) .await?; + let chunks = read_res + .columns_chunks()? + .into_iter() + .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) + .collect::>(); self.state = State::FilterData(inner_part, chunks); } State::ReadRemain { @@ -302,9 +318,22 @@ impl Processor for UpdateSource { filter, } => { if let Some(remain_reader) = self.remain_reader.as_ref() { - let chunks = remain_reader - .read_columns_data(self.ctx.clone(), part.clone()) + let fuse_part = FusePartInfo::from_part(&part)?; + + let settings = ReadSettings::from_ctx(&self.ctx)?; + let read_res = remain_reader + .read_columns_data_by_merge_io( + &settings, + &fuse_part.location, + &fuse_part.columns_meta, + ) .await?; + let chunks = read_res + .columns_chunks()? + .into_iter() + .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) + .collect::>(); + self.state = State::MergeRemain { part, chunks, From 4d25c4f282dac081ef6f3269fac1c096e3ab937b Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 3 Jan 2023 20:29:31 +0800 Subject: [PATCH 12/26] Add sqllogic test --- .../suites/base/03_dml/03_0035_update | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/sqllogictests/suites/base/03_dml/03_0035_update diff --git a/tests/sqllogictests/suites/base/03_dml/03_0035_update b/tests/sqllogictests/suites/base/03_dml/03_0035_update new file mode 100644 index 000000000000..3fccb9d507d2 --- /dev/null +++ b/tests/sqllogictests/suites/base/03_dml/03_0035_update @@ -0,0 +1,32 @@ +statement ok +DROP DATABASE IF EXISTS db1 + +statement ok +CREATE DATABASE db1 + +statement ok +USE db1 + +statement ok +CREATE TABLE IF NOT EXISTS t(a Int, b Date) + +statement ok +INSERT INTO t VALUES(1, '2022-12-30') + +statement ok +INSERT INTO t VALUES(2, '2023-01-01') + +statement ok +UPDATE t SET a = 3 WHERE b > '2022-12-31' + +query IT +SELECT * FROM t ORDER BY b +---- +1 2022-12-30 +3 2023-01-01 + +statement ok +drop table t all + +statement ok +DROP DATABASE db1 From e99bea025da194f3df0d0464234376cf349ac461 Mon Sep 17 00:00:00 2001 From: zhyass Date: Tue, 3 Jan 2023 22:19:33 +0800 Subject: [PATCH 13/26] add scan progress --- .../mutation/update/update_source.rs | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index 346ae581ff14..34e91f66fef5 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ 
b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -15,12 +15,12 @@ use std::any::Any; use std::sync::Arc; +use common_base::base::Progress; +use common_base::base::ProgressValues; use common_catalog::plan::PartInfoPtr; use common_catalog::table_context::TableContext; use common_datablocks::DataBlock; -use common_datavalues::ColumnRef; -use common_datavalues::DataField; -use common_datavalues::ToDataType; +use common_datavalues::prelude::*; use common_exception::ErrorCode; use common_exception::Result; use common_sql::evaluator::ChunkOperator; @@ -75,6 +75,7 @@ enum State { pub struct UpdateSource { state: State, ctx: Arc, + scan_progress: Arc, output: Arc, location_gen: TableMetaLocationGenerator, dal: Operator, @@ -98,9 +99,11 @@ impl UpdateSource { remain_reader: Arc>, operators: Vec, ) -> Result { + let scan_progress = ctx.get_scan_progress(); Ok(ProcessorPtr::create(Box::new(UpdateSource { state: State::ReadData(None), ctx: ctx.clone(), + scan_progress, output, location_gen: table.meta_location_generator().clone(), dal: table.get_operator(), @@ -181,6 +184,13 @@ impl Processor for UpdateSource { .vector; let filter = DataBlock::cast_to_nonull_boolean(&filter_result)?; if DataBlock::filter_exists(&filter)? { + let col: &BooleanColumn = Series::check_get(&filter)?; + let progress_values = ProgressValues { + rows: col.len() - col.values().unset_bits(), + bytes: 0, + }; + self.scan_progress.incr(&progress_values); + if self.remain_reader.is_none() { self.state = State::MergeRemain { part, @@ -199,6 +209,11 @@ impl Processor for UpdateSource { self.state = State::Generated(Mutation::DoNothing); } } else { + let progress_values = ProgressValues { + rows: data_block.num_rows(), + bytes: 0, + }; + self.scan_progress.incr(&progress_values); self.state = State::UpdateData(data_block); } } From 3a707a59bff0fd4de0e8a6e0032d05ef89a6b799 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 6 Jan 2023 16:49:31 +0800 Subject: [PATCH 14/26] Add serialize data transform --- .../mutation/deletion/deletion_source.rs | 7 +- .../fuse/src/operations/mutation/mod.rs | 5 +- .../src/operations/mutation/mutation_meta.rs | 63 +++++- .../operations/mutation/mutation_source.rs | 111 ++++++++++ .../operations/mutation/mutation_transform.rs | 4 +- .../mutation/serialize_data_transform.rs | 209 ++++++++++++++++++ .../mutation/update/update_source.rs | 4 +- .../storages/fuse/src/operations/read/mod.rs | 1 + 8 files changed, 387 insertions(+), 17 deletions(-) create mode 100644 src/query/storages/fuse/src/operations/mutation/mutation_source.rs create mode 100644 src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs index dfa887d0c565..925edb477404 100644 --- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs @@ -40,7 +40,7 @@ use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::DataChunks; use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; -use crate::operations::mutation::MutationSourceMeta; +use crate::operations::mutation::MutationTransformMeta; use crate::operations::mutation::SerializeState; use crate::operations::util; use crate::operations::BloomIndexState; @@ -199,8 +199,7 @@ impl Processor for DeletionSource { // none of the rows 
should be removed. self.state = State::Generated(Mutation::DoNothing); } else if self.remain_reader.is_none() { - let block = data_block.resort(self.output_schema.clone())?; - self.state = State::NeedSerialize(block); + self.state = State::NeedSerialize(data_block); } else { self.state = State::ReadRemain { part, @@ -287,7 +286,7 @@ impl Processor for DeletionSource { ); } State::Generated(op) => { - let meta = MutationSourceMeta::create(self.index, op); + let meta = MutationTransformMeta::create(self.index, op); let new_part = self.ctx.try_get_part(); self.state = State::Output(new_part, DataBlock::empty_with_meta(meta)); } diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs index a672d2ffda59..1de8091b487b 100644 --- a/src/query/storages/fuse/src/operations/mutation/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/mod.rs @@ -19,8 +19,10 @@ mod deletion; pub mod mutation_meta; mod mutation_part; pub mod mutation_sink; +mod mutation_source; mod mutation_transform; pub mod recluster_mutator; +mod serialize_data_transform; mod update; mod util; @@ -36,7 +38,8 @@ pub use compact::SegmentCompactor; pub use deletion::DeletionSource; pub use mutation_meta::Mutation; pub use mutation_meta::MutationSinkMeta; -pub use mutation_meta::MutationSourceMeta; +pub use mutation_meta::MutationTransformMeta; +pub use mutation_meta::SerializeDataMeta; pub use mutation_part::MutationPartInfo; pub use mutation_sink::MutationSink; pub use mutation_transform::MutationTransform; diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs index e64444330e46..869d56567cfa 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs @@ -20,12 +20,59 @@ use common_datablocks::BlockMetaInfoPtr; use common_exception::ErrorCode; use common_exception::Result; use common_storages_table_meta::meta::BlockMeta; +use common_storages_table_meta::meta::ClusterStatistics; use common_storages_table_meta::meta::Location; use common_storages_table_meta::meta::Statistics; use crate::operations::mutation::AbortOperation; use crate::pruning::BlockIndex; +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] +pub struct SerializeDataMeta { + pub index: BlockIndex, + pub cluster_stats: Option<ClusterStatistics>, +} + +#[typetag::serde(name = "serialize_data_meta")] +impl BlockMetaInfo for SerializeDataMeta { + fn as_any(&self) -> &dyn Any { + self + } + + fn as_mut_any(&mut self) -> &mut dyn Any { + self + } + + fn clone_self(&self) -> Box<dyn BlockMetaInfo> { + Box::new(self.clone()) + } + + fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool { + match info.as_any().downcast_ref::<SerializeDataMeta>() { + None => false, + Some(other) => self == other, + } + } +} + +impl SerializeDataMeta { + pub fn create(index: BlockIndex, cluster_stats: Option<ClusterStatistics>) -> BlockMetaInfoPtr { + Box::new(SerializeDataMeta { + index, + cluster_stats, + }) + } + + pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&SerializeDataMeta> { + match info.as_any().downcast_ref::<SerializeDataMeta>() { + Some(part_ref) => Ok(part_ref), + None => Err(ErrorCode::Internal( + "Cannot downcast from BlockMetaInfo to SerializeDataMeta.", + )), + } + } +} + #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] pub enum Mutation { DoNothing, @@ -34,13 +81,13 @@ pub enum Mutation { } #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] -pub struct MutationSourceMeta { +pub struct MutationTransformMeta { pub index: BlockIndex, pub op: Mutation, } #[typetag::serde(name = "mutation_source_meta")] -impl BlockMetaInfo for MutationSourceMeta { +impl BlockMetaInfo for MutationTransformMeta { fn as_any(&self) -> &dyn Any { self } @@ -54,23 +101,23 @@ impl BlockMetaInfo for MutationSourceMeta { } fn equals(&self, info: &Box<dyn BlockMetaInfo>) -> bool { - match info.as_any().downcast_ref::<MutationSourceMeta>() { + match info.as_any().downcast_ref::<MutationTransformMeta>() { None => false, Some(other) => self == other, } } } -impl MutationSourceMeta { +impl MutationTransformMeta { pub fn create(index: BlockIndex, op: Mutation) -> BlockMetaInfoPtr { - Box::new(MutationSourceMeta { index, op }) + Box::new(MutationTransformMeta { index, op }) } - pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&MutationSourceMeta> { - match info.as_any().downcast_ref::<MutationSourceMeta>() { + pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&MutationTransformMeta> { + match info.as_any().downcast_ref::<MutationTransformMeta>() { Some(part_ref) => Ok(part_ref), None => Err(ErrorCode::Internal( - "Cannot downcast from BlockMetaInfo to MutationSourceMeta.", + "Cannot downcast from BlockMetaInfo to MutationTransformMeta.", )), } } diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs new file mode 100644 index 000000000000..6f4db9e57279 --- /dev/null +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -0,0 +1,111 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
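A minimal sketch of the attach/recover round trip these metas support, using only calls that appear in this series (index and stats are assumed to be in scope):

    // Upstream: tag an empty block with the meta.
    let block = DataBlock::empty_with_meta(SerializeDataMeta::create(index, stats));
    // Downstream: recover it via the typetag-backed downcast.
    let info = block
        .get_meta()
        .cloned()
        .ok_or_else(|| ErrorCode::Internal("No block meta. It's a bug"))?;
    let meta = SerializeDataMeta::from_meta(&info)?;
    // meta.index addresses the (segment, block) pair; meta.cluster_stats
    // preserves the block's original cluster statistics, if any.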
+ +use std::any::Any; +use std::sync::Arc; + +use common_base::base::tokio; +use common_catalog::plan::PartInfoPtr; +use common_catalog::table_context::TableContext; +use common_datablocks::DataBlock; +use common_exception::Result; + +use crate::fuse_part::FusePartInfo; +use crate::io::BlockReader; +use crate::io::ReadSettings; +use crate::operations::mutation::MutationPartInfo; +use crate::operations::read::DataSourceMeta; +use crate::pipelines::processors::port::OutputPort; +use crate::pipelines::processors::processor::Event; +use crate::pipelines::processors::Processor; +use crate::MergeIOReadResult; + +pub struct MutationSource { + finished: bool, + ctx: Arc, + batch_size: usize, + block_reader: Arc, + + output: Arc, + output_data: Option<(Vec, Vec)>, +} + +#[async_trait::async_trait] +impl Processor for MutationSource { + fn name(&self) -> String { + String::from("MutationSource") + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if self.finished { + self.output.finish(); + return Ok(Event::Finished); + } + + if self.output.is_finished() { + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if let Some((part, data)) = self.output_data.take() { + let output = DataBlock::empty_with_meta(DataSourceMeta::create(part, data)); + self.output.push_data(Ok(output)); + } + + Ok(Event::Async) + } + + async fn async_process(&mut self) -> Result<()> { + let parts = self.ctx.try_get_parts(self.batch_size); + + if !parts.is_empty() { + let mut chunks = Vec::with_capacity(parts.len()); + for part in &parts { + let part = part.clone(); + let block_reader = self.block_reader.clone(); + let settings = ReadSettings::from_ctx(&self.ctx)?; + + chunks.push(async move { + tokio::spawn(async move { + let deletion_part = MutationPartInfo::from_part(&part)?; + let fuse_part = FusePartInfo::from_part(&deletion_part.inner_part)?; + + block_reader + .read_columns_data_by_merge_io( + &settings, + &fuse_part.location, + &fuse_part.columns_meta, + ) + .await + }) + .await + .unwrap() + }); + } + + self.output_data = Some((parts, futures::future::try_join_all(chunks).await?)); + return Ok(()); + } + + self.finished = true; + Ok(()) + } +} diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_transform.rs b/src/query/storages/fuse/src/operations/mutation/mutation_transform.rs index 1efc7347fb01..741eeca58701 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_transform.rs @@ -36,7 +36,7 @@ use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::AbortOperation; use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationSinkMeta; -use crate::operations::mutation::MutationSourceMeta; +use crate::operations::mutation::MutationTransformMeta; use crate::pipelines::processors::port::InputPort; use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; @@ -228,7 +228,7 @@ impl Processor for MutationTransform { .get_meta() .cloned() .ok_or_else(|| ErrorCode::Internal("No block meta. 
It's a bug"))?; - let meta = MutationSourceMeta::from_meta(&input_meta)?; + let meta = MutationTransformMeta::from_meta(&input_meta)?; match &meta.op { Mutation::Replaced(block_meta) => { self.input_metas diff --git a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs new file mode 100644 index 000000000000..7991904d2dd3 --- /dev/null +++ b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs @@ -0,0 +1,209 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use common_datablocks::DataBlock; +use common_exception::ErrorCode; +use common_exception::Result; +use common_pipeline_core::processors::port::InputPort; +use common_storages_common::blocks_to_parquet; +use common_storages_table_meta::meta::BlockMeta; +use common_storages_table_meta::meta::ClusterStatistics; +use common_storages_table_meta::table::TableCompression; +use opendal::Operator; + +use crate::io::write_data; +use crate::io::TableMetaLocationGenerator; +use crate::operations::mutation::Mutation; +use crate::operations::mutation::MutationTransformMeta; +use crate::operations::mutation::SerializeDataMeta; +use crate::operations::mutation::SerializeState; +use crate::operations::util; +use crate::operations::BloomIndexState; +use crate::pipelines::processors::port::OutputPort; +use crate::pipelines::processors::processor::Event; +use crate::pipelines::processors::Processor; +use crate::pruning::BlockIndex; +use crate::statistics::gen_columns_statistics; +use crate::statistics::ClusterStatsGenerator; + +enum State { + Consume, + NeedSerialize(DataBlock), + Serialized(SerializeState, Arc), + Output(Mutation), +} + +pub struct SerializeDataTransform { + state: State, + input: Arc, + output: Arc, + output_data: Option, + + location_gen: TableMetaLocationGenerator, + dal: Operator, + cluster_stats_gen: ClusterStatsGenerator, + + index: BlockIndex, + origin_stats: Option, + table_compression: TableCompression, +} + +#[async_trait::async_trait] +impl Processor for SerializeDataTransform { + fn name(&self) -> String { + "SerializeDataTransform".to_string() + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn event(&mut self) -> Result { + if matches!(self.state, State::NeedSerialize(_) | State::Output(_)) { + return Ok(Event::Sync); + } + + if matches!(self.state, State::Serialized(_, _)) { + return Ok(Event::Async); + } + + if self.output.is_finished() { + return Ok(Event::Finished); + } + + if !self.output.can_push() { + return Ok(Event::NeedConsume); + } + + if let Some(data_block) = self.output_data.take() { + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } + + if self.input.is_finished() { + self.output.finish(); + return Ok(Event::Finished); + } + + if !self.input.has_data() { + self.input.set_need_data(); + return Ok(Event::NeedData); + } + + let mut input_data = 
self.input.pull_data().unwrap()?; + let meta = input_data.take_meta(); + if meta.is_none() { + self.state = State::Output(Mutation::DoNothing); + } else { + let meta = meta.unwrap(); + let meta = SerializeDataMeta::from_meta(&meta)?; + self.index = meta.index; + self.origin_stats = meta.cluster_stats.clone(); + if input_data.is_empty() { + self.state = State::Output(Mutation::Deleted); + } else { + self.state = State::NeedSerialize(input_data); + } + } + Ok(Event::Sync) + } + + fn process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Consume) { + State::NeedSerialize(block) => { + let cluster_stats = self + .cluster_stats_gen + .gen_with_origin_stats(&block, std::mem::take(&mut self.origin_stats))?; + + let row_count = block.num_rows() as u64; + let block_size = block.memory_size() as u64; + let (block_location, block_id) = self.location_gen.gen_block_location(); + + // build block index. + let location = self.location_gen.block_bloom_index_location(&block_id); + let (bloom_index_state, column_distinct_count) = + BloomIndexState::try_create(&block, location)?; + let col_stats = gen_columns_statistics(&block, Some(column_distinct_count))?; + + // serialize data block. + let mut block_data = Vec::with_capacity(100 * 1024 * 1024); + let schema = block.schema().clone(); + let (file_size, meta_data) = blocks_to_parquet( + &schema, + vec![block], + &mut block_data, + self.table_compression, + )?; + let col_metas = util::column_metas(&meta_data)?; + + // new block meta. + let new_meta = Arc::new(BlockMeta::new( + row_count, + block_size, + file_size, + col_stats, + col_metas, + cluster_stats, + block_location.clone(), + Some(bloom_index_state.location.clone()), + bloom_index_state.size, + self.table_compression.into(), + )); + + self.state = State::Serialized( + SerializeState { + block_data, + block_location: block_location.0, + index_data: bloom_index_state.data, + index_location: bloom_index_state.location.0, + }, + new_meta, + ); + } + State::Output(op) => { + let meta = MutationTransformMeta::create(self.index, op); + self.output_data = Some(DataBlock::empty_with_meta(meta)); + } + _ => return Err(ErrorCode::Internal("It's a bug.")), + } + Ok(()) + } + + async fn async_process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Consume) { + State::Serialized(serialize_state, block_meta) => { + // write block data. + write_data( + &serialize_state.block_data, + &self.dal, + &serialize_state.block_location, + ) + .await?; + // write index data. 
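+            // (The index blob lands right after the parquet block above;
+            // only when both writes succeed does the state advance to
+            // Output(Mutation::Replaced(block_meta)), which process() wraps
+            // into a MutationTransformMeta for MutationTransform to apply.)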
+ write_data( + &serialize_state.index_data, + &self.dal, + &serialize_state.index_location, + ) + .await?; + self.state = State::Output(Mutation::Replaced(block_meta)); + } + _ => return Err(ErrorCode::Internal("It's a bug.")), + } + Ok(()) + } +} diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index 34e91f66fef5..b7831243f6e8 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -38,7 +38,7 @@ use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::DataChunks; use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; -use crate::operations::mutation::MutationSourceMeta; +use crate::operations::mutation::MutationTransformMeta; use crate::operations::mutation::SerializeState; use crate::operations::util; use crate::operations::BloomIndexState; @@ -294,7 +294,7 @@ impl Processor for UpdateSource { ); } State::Generated(op) => { - let meta = MutationSourceMeta::create(self.index, op); + let meta = MutationTransformMeta::create(self.index, op); let new_part = self.ctx.try_get_part(); self.state = State::Output(new_part, DataBlock::empty_with_meta(meta)); } diff --git a/src/query/storages/fuse/src/operations/read/mod.rs b/src/query/storages/fuse/src/operations/read/mod.rs index 3279f6c5efc1..6b0f87d32527 100644 --- a/src/query/storages/fuse/src/operations/read/mod.rs +++ b/src/query/storages/fuse/src/operations/read/mod.rs @@ -21,3 +21,4 @@ mod parquet_data_source_deserializer; mod parquet_data_source_reader; pub use fuse_source::build_fuse_parquet_source_pipeline; +pub use parquet_data_source::DataSourceMeta; From 2e8e87e902bd58202474fbbdd9a9d7a08a117e92 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 6 Jan 2023 17:02:51 +0800 Subject: [PATCH 15/26] resolve conflict --- .../mutation/deletion/deletion_meta.rs | 74 ------------------- .../src/operations/mutation/mutation_meta.rs | 8 +- 2 files changed, 2 insertions(+), 80 deletions(-) delete mode 100644 src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs deleted file mode 100644 index 11b1c135c505..000000000000 --- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_meta.rs +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::any::Any; -use std::sync::Arc; - -use common_exception::ErrorCode; -use common_exception::Result; -use common_expression::BlockMetaInfo; -use common_expression::BlockMetaInfoPtr; -use common_storages_table_meta::meta::BlockMeta; - -use crate::pruning::BlockIndex; - -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] -pub enum Deletion { - DoNothing, - Replaced(Arc), - Deleted, -} - -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] -pub struct DeletionSourceMeta { - pub index: BlockIndex, - pub op: Deletion, -} - -#[typetag::serde(name = "deletion_source_meta")] -impl BlockMetaInfo for DeletionSourceMeta { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_mut_any(&mut self) -> &mut dyn Any { - self - } - - fn clone_self(&self) -> Box { - Box::new(self.clone()) - } - - fn equals(&self, info: &Box) -> bool { - match info.as_any().downcast_ref::() { - None => false, - Some(other) => self == other, - } - } -} - -impl DeletionSourceMeta { - pub fn create(index: BlockIndex, op: Deletion) -> BlockMetaInfoPtr { - Box::new(DeletionSourceMeta { index, op }) - } - - pub fn from_meta(info: &BlockMetaInfoPtr) -> Result<&DeletionSourceMeta> { - match info.as_any().downcast_ref::() { - Some(part_ref) => Ok(part_ref), - None => Err(ErrorCode::Internal( - "Cannot downcast from BlockMetaInfo to DeletionSourceMeta.", - )), - } - } -} diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs index 61be2bbcde6a..77fc0ac8d712 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs @@ -17,10 +17,10 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; -use common_storages_table_meta::meta::BlockMeta; -use common_storages_table_meta::meta::ClusterStatistics; use common_expression::BlockMetaInfo; use common_expression::BlockMetaInfoPtr; +use common_storages_table_meta::meta::BlockMeta; +use common_storages_table_meta::meta::ClusterStatistics; use common_storages_table_meta::meta::Location; use common_storages_table_meta::meta::Statistics; @@ -169,11 +169,7 @@ impl MutationSinkMeta { match info.as_any().downcast_ref::() { Some(part_ref) => Ok(part_ref), None => Err(ErrorCode::Internal( -<<<<<<< HEAD "Cannot downcast from BlockMetaInfo to MutationSinkMeta.", -======= - "Cannot downcast from ChunkMetaInfo to MutationMeta.", ->>>>>>> upstream/main )), } } From 9fe6f0e958b37595df24a8aab1482167cf673e22 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 11 Jan 2023 01:26:02 +0800 Subject: [PATCH 16/26] add mutation source --- src/query/catalog/src/table.rs | 4 +- .../src/interpreters/interpreter_delete.rs | 1 - src/query/sql/src/planner/plans/update.rs | 4 +- src/query/storages/fuse/src/fuse_table.rs | 4 +- .../storages/fuse/src/operations/delete.rs | 51 ++- .../mutation/deletion/deletion_source.rs | 163 ++------ .../fuse/src/operations/mutation/mod.rs | 3 + .../src/operations/mutation/mutation_meta.rs | 2 +- .../operations/mutation/mutation_source.rs | 349 ++++++++++++++++-- .../mutation/serialize_data_transform.rs | 44 ++- .../mutation/update/update_source.rs | 296 +++++++-------- 11 files changed, 573 insertions(+), 348 deletions(-) diff --git a/src/query/catalog/src/table.rs b/src/query/catalog/src/table.rs index ed28d8a1b8d5..c31d033f3658 100644 --- a/src/query/catalog/src/table.rs +++ b/src/query/catalog/src/table.rs @@ -243,9 +243,9 @@ pub 
trait Table: Sync + Send { async fn update( &self, ctx: Arc, - filter: Option, + filter: Option>, col_indices: Vec, - update_list: Vec<(usize, Expression)>, + update_list: Vec<(usize, RemoteExpr)>, pipeline: &mut Pipeline, ) -> Result<()> { let (_, _, _, _, _) = (ctx, filter, col_indices, update_list, pipeline); diff --git a/src/query/service/src/interpreters/interpreter_delete.rs b/src/query/service/src/interpreters/interpreter_delete.rs index 2895e2d444f8..097913d8ea99 100644 --- a/src/query/service/src/interpreters/interpreter_delete.rs +++ b/src/query/service/src/interpreters/interpreter_delete.rs @@ -25,7 +25,6 @@ use crate::pipelines::executor::PipelineCompleteExecutor; use crate::pipelines::PipelineBuildResult; use crate::sessions::QueryContext; use crate::sessions::TableContext; -use crate::sql::executor::ExpressionBuilderWithoutRenaming; use crate::sql::plans::DeletePlan; use crate::sql::plans::ScalarExpr; diff --git a/src/query/sql/src/planner/plans/update.rs b/src/query/sql/src/planner/plans/update.rs index 9f7fc58cff23..186afc9a9e81 100644 --- a/src/query/sql/src/planner/plans/update.rs +++ b/src/query/sql/src/planner/plans/update.rs @@ -15,8 +15,8 @@ use std::collections::HashMap; use std::sync::Arc; -use common_datavalues::DataSchema; -use common_datavalues::DataSchemaRef; +use common_expression::DataSchema; +use common_expression::DataSchemaRef; use common_meta_types::MetaId; use crate::plans::Scalar; diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index 504d7e0cece8..f87f3f506da2 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -540,9 +540,9 @@ impl Table for FuseTable { async fn update( &self, ctx: Arc, - filter: Option, + filter: Option>, col_indices: Vec, - update_list: Vec<(usize, Expression)>, + update_list: Vec<(usize, RemoteExpr)>, pipeline: &mut Pipeline, ) -> Result<()> { self.do_update(ctx, filter, col_indices, update_list, pipeline) diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs index 33d6ed955c44..93e9082db99f 100644 --- a/src/query/storages/fuse/src/operations/delete.rs +++ b/src/query/storages/fuse/src/operations/delete.rs @@ -39,9 +39,11 @@ use common_sql::evaluator::BlockOperator; use common_storages_table_meta::meta::Location; use common_storages_table_meta::meta::TableSnapshot; -use crate::operations::mutation::DeletionSource; +use super::mutation::SerializeDataTransform; +use crate::operations::mutation::MutationAction; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSink; +use crate::operations::mutation::MutationSource; use crate::operations::mutation::MutationTransform; use crate::pipelines::processors::port::InputPort; use crate::pipelines::processors::port::OutputPort; @@ -53,13 +55,13 @@ use crate::FuseTable; impl FuseTable { /// The flow of Pipeline is as follows: - /// +---------------+ - /// |DeletionSource1| ------ - /// +---------------+ | +-----------------+ +------------+ - /// | ... | ... | ---> |MutationTransform| ---> |MutationSink| - /// +---------------+ | +-----------------+ +------------+ - /// |DeletionSourceN| ------ - /// +---------------+ + /// +---------------+ +-----------------------+ + /// |MutationSource1| ---> |SerializeDataTransform1| ------ + /// +---------------+ +-----------------------+ | +-----------------+ +------------+ + /// | ... | ---> | ... | ... 
| ---> |MutationTransform| ---> |MutationSink| + /// +---------------+ +-----------------------+ | +-----------------+ +------------+ + /// |MutationSourceN| ---> |SerializeDataTransformN| ------ + /// +---------------+ +-----------------------+ pub async fn do_delete( &self, ctx: Arc, @@ -105,7 +107,7 @@ impl FuseTable { // if the `filter_expr` is of "constant" nullary : // for the whole block, whether all of the rows should be kept or dropped, // we can just return from here, without accessing the block data - if self.try_eval_const(ctx.clone(), &self.table_info.schema(), &filter_expr)? { + if self.try_eval_const(ctx.clone(), &self.schema(), &filter_expr)? { let progress_values = ProgressValues { rows: snapshot.summary.row_count as usize, bytes: snapshot.summary.uncompressed_byte_size as usize, @@ -123,6 +125,17 @@ impl FuseTable { self.try_add_deletion_source(ctx.clone(), &filter_expr, col_indices, &snapshot, pipeline) .await?; + let cluster_stats_gen = self.cluster_stats_gen(ctx.clone())?; + pipeline.add_transform(|input, output| { + SerializeDataTransform::try_create( + ctx.clone(), + input, + output, + self, + cluster_stats_gen.clone(), + ) + })?; + self.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?; pipeline.add_sink(|input| { @@ -218,30 +231,44 @@ impl FuseTable { ctx.try_set_partitions(parts)?; let block_reader = self.create_block_reader(projection.clone())?; + let schema = block_reader.schema(); + let filter = + Arc::new(filter.as_expr(&BUILTIN_FUNCTIONS).map(|expr| { + expr.project_column_ref(|name| schema.column_with_name(name).unwrap().0) + })); + let all_col_ids = self.all_the_columns_ids(); let remain_col_ids: Vec = all_col_ids .into_iter() .filter(|id| !col_indices.contains(id)) .collect(); + let mut source_col_ids = col_indices; let remain_reader = if remain_col_ids.is_empty() { Arc::new(None) } else { + source_col_ids.extend_from_slice(&remain_col_ids); Arc::new(Some( (*self.create_block_reader(Projection::Columns(remain_col_ids))?).clone(), )) }; + // resort the block. + let mut projection = (0..source_col_ids.len()).collect::>(); + projection.sort_by_key(|&i| source_col_ids[i]); + let ops = vec![BlockOperator::Project { projection }]; + let max_threads = ctx.get_settings().get_max_threads()? as usize; // Add source pipe. 
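+        // A worked example of the resort above: with col_indices = [2] and
+        // remain_col_ids = [0, 1], the reader yields columns in source order
+        // [2, 0, 1]. Sorting the positions [0, 1, 2] by source_col_ids[i]
+        // gives projection = [1, 2, 0], so BlockOperator::Project restores
+        // the schema order [0, 1, 2] before blocks reach the serializer.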
pipeline.add_source( |output| { - DeletionSource::try_create( + MutationSource::try_create( ctx.clone(), + MutationAction::Deletion, output, - self, + filter.clone(), block_reader.clone(), - Arc::new(filter.clone()), remain_reader.clone(), + ops.clone(), ) }, max_threads, diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs index 70dbc4ee48c9..91049d263ac1 100644 --- a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs @@ -25,38 +25,23 @@ use common_expression::Column; use common_expression::DataBlock; use common_expression::DataSchema; use common_expression::Evaluator; -use common_expression::RemoteExpr; +use common_expression::Expr; use common_expression::TableSchemaRef; -use common_expression::TableSchemaRefExt; use common_expression::Value; use common_functions::scalars::BUILTIN_FUNCTIONS; -use common_storages_common::blocks_to_parquet; -use common_storages_table_meta::meta::BlockMeta; use common_storages_table_meta::meta::ClusterStatistics; -use common_storages_table_meta::table::TableCompression; -use opendal::Operator; use crate::fuse_part::FusePartInfo; -use crate::io::write_data; use crate::io::BlockReader; use crate::io::ReadSettings; -use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::DataChunks; -use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; -use crate::operations::mutation::MutationTransformMeta; -use crate::operations::mutation::SerializeState; -use crate::operations::util; -use crate::operations::BloomIndexState; +use crate::operations::mutation::SerializeDataMeta; use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::processor::ProcessorPtr; use crate::pipelines::processors::Processor; use crate::pruning::BlockIndex; -use crate::statistics::gen_columns_statistics; -use crate::statistics::ClusterStatsGenerator; -use crate::FuseTable; -use crate::Table; enum State { ReadData(Option), @@ -72,60 +57,43 @@ enum State { data_block: DataBlock, filter: Value, }, - NeedSerialize(DataBlock), - Serialized(SerializeState, Arc), - Generated(Mutation), Output(Option, DataBlock), Finish, } pub struct DeletionSource { state: State, - ctx: Arc, output: Arc, - location_gen: TableMetaLocationGenerator, - dal: Operator, + + ctx: Arc, + filter: Arc, block_reader: Arc, - filter: Arc>, remain_reader: Arc>, - source_schema: TableSchemaRef, output_schema: TableSchemaRef, index: BlockIndex, - cluster_stats_gen: ClusterStatsGenerator, origin_stats: Option, - table_compression: TableCompression, } impl DeletionSource { pub fn try_create( ctx: Arc, output: Arc, - table: &FuseTable, + filter: Arc, block_reader: Arc, - filter: Arc>, remain_reader: Arc>, + output_schema: TableSchemaRef, ) -> Result { - let mut srouce_fields = block_reader.schema().fields().clone(); - if let Some(remain_reader) = remain_reader.as_ref() { - srouce_fields.extend_from_slice(remain_reader.schema().fields()); - } - let source_schema = TableSchemaRefExt::create(srouce_fields); Ok(ProcessorPtr::create(Box::new(DeletionSource { state: State::ReadData(None), - ctx: ctx.clone(), output, - location_gen: table.meta_location_generator().clone(), - dal: table.get_operator(), - block_reader, + ctx: ctx.clone(), filter, + block_reader, remain_reader, - source_schema, 
- output_schema: table.schema(), + output_schema, index: (0, 0), - cluster_stats_gen: table.cluster_stats_gen(ctx)?, origin_stats: None, - table_compression: table.table_compression, }))) } } @@ -175,10 +143,7 @@ impl Processor for DeletionSource { } } - if matches!( - self.state, - State::ReadData(_) | State::ReadRemain { .. } | State::Serialized(_, _) - ) { + if matches!(self.state, State::ReadData(_) | State::ReadRemain { .. }) { Ok(Event::Async) } else { Ok(Event::Sync) @@ -188,20 +153,16 @@ impl Processor for DeletionSource { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Finish) { State::FilterData(part, chunks) => { - let data_block = self + let mut data_block = self .block_reader .deserialize_parquet_chunks(part.clone(), chunks)?; let func_ctx = self.ctx.try_get_function_context()?; let evaluator = Evaluator::new(&data_block, func_ctx, &BUILTIN_FUNCTIONS); - let expr = self - .filter - .as_expr(&BUILTIN_FUNCTIONS) - .unwrap() - .project_column_ref(|name| self.source_schema.index_of(name).unwrap()); - let res = evaluator.run(&expr).map_err(|(_, e)| { - ErrorCode::Internal(format!("eval try eval const failed: {}.", e)) - })?; + + let res = evaluator + .run(&self.filter) + .map_err(|(_, e)| ErrorCode::Internal(format!("eval filter failed: {}.", e)))?; let predicates = DataBlock::cast_to_nonull_boolean(&res).ok_or_else(|| { ErrorCode::BadArguments( "Result of filter expression cannot be converted to boolean.", @@ -212,15 +173,21 @@ impl Processor for DeletionSource { let filter = Value::Column(Column::Boolean(predicate_col.not())); if !DataBlock::filter_exists(&filter)? { // all the rows should be removed. - self.state = State::Generated(Mutation::Deleted); + let meta = SerializeDataMeta::create(self.index, self.origin_stats); + self.state = + State::Output(self.ctx.try_get_part(), DataBlock::empty_with_meta(meta)); } else { let num_rows = data_block.num_rows(); - let data_block = data_block.filter(&filter)?; + data_block = data_block.filter(&filter)?; if data_block.num_rows() == num_rows { // none of the rows should be removed. - self.state = State::Generated(Mutation::DoNothing); + self.state = State::Output(self.ctx.try_get_part(), DataBlock::empty()); } else if self.remain_reader.is_none() { - self.state = State::NeedSerialize(data_block); + let meta = SerializeDataMeta::create(self.index, self.origin_stats); + self.state = State::Output( + self.ctx.try_get_part(), + data_block.add_meta(Some(meta))?, + ); } else { self.state = State::ReadRemain { part, @@ -255,66 +222,9 @@ impl Processor for DeletionSource { let src_schema = DataSchema::new(fields); let dest_schema = self.output_schema.clone().into(); let block = merged.resort(&src_schema, &dest_schema)?; - self.state = State::NeedSerialize(block); - } - State::NeedSerialize(block) => { - let cluster_stats = self - .cluster_stats_gen - .gen_with_origin_stats(&block, std::mem::take(&mut self.origin_stats))?; - - let row_count = block.num_rows() as u64; - let block_size = block.memory_size() as u64; - let (block_location, block_id) = self.location_gen.gen_block_location(); - - // build block index. - let location = self.location_gen.block_bloom_index_location(&block_id); - let (bloom_index_state, column_distinct_count) = BloomIndexState::try_create( - self.ctx.clone(), - self.source_schema.clone(), - &block, - location, - )?; - let col_stats = gen_columns_statistics(&block, Some(column_distinct_count))?; - // serialize data block. 
- let mut block_data = Vec::with_capacity(100 * 1024 * 1024); - let schema = self.source_schema.clone(); - let (file_size, meta_data) = blocks_to_parquet( - &schema, - vec![block], - &mut block_data, - self.table_compression, - )?; - let col_metas = util::column_metas(&meta_data)?; - - // new block meta. - let new_meta = Arc::new(BlockMeta::new( - row_count, - block_size, - file_size, - col_stats, - col_metas, - cluster_stats, - block_location.clone(), - Some(bloom_index_state.location.clone()), - bloom_index_state.size, - self.table_compression.into(), - )); - - self.state = State::Serialized( - SerializeState { - block_data, - block_location: block_location.0, - index_data: bloom_index_state.data, - index_location: bloom_index_state.location.0, - }, - new_meta, - ); - } - State::Generated(op) => { - let meta = MutationTransformMeta::create(self.index, op); - let new_part = self.ctx.try_get_part(); - self.state = State::Output(new_part, DataBlock::empty_with_meta(meta)); + let meta = SerializeDataMeta::create(self.index, self.origin_stats); + self.state = State::Output(self.ctx.try_get_part(), block.add_meta(Some(meta))?); } _ => return Err(ErrorCode::Internal("It's a bug.")), } @@ -379,23 +289,6 @@ impl Processor for DeletionSource { return Err(ErrorCode::Internal("It's a bug. No remain reader")); } } - State::Serialized(serialize_state, block_meta) => { - // write block data. - write_data( - &serialize_state.block_data, - &self.dal, - &serialize_state.block_location, - ) - .await?; - // write index data. - write_data( - &serialize_state.index_data, - &self.dal, - &serialize_state.index_location, - ) - .await?; - self.state = State::Generated(Mutation::Replaced(block_meta)); - } _ => return Err(ErrorCode::Internal("It's a bug.")), } Ok(()) diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs index 1de8091b487b..af9ed4a2a896 100644 --- a/src/query/storages/fuse/src/operations/mutation/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/mod.rs @@ -42,8 +42,11 @@ pub use mutation_meta::MutationTransformMeta; pub use mutation_meta::SerializeDataMeta; pub use mutation_part::MutationPartInfo; pub use mutation_sink::MutationSink; +pub use mutation_source::MutationAction; +pub use mutation_source::MutationSource; pub use mutation_transform::MutationTransform; pub use recluster_mutator::ReclusterMutator; +pub use serialize_data_transform::SerializeDataTransform; pub use update::UpdateSource; pub use util::DataChunks; pub use util::SerializeState; diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs index 77fc0ac8d712..12eac1a2893b 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs @@ -86,7 +86,7 @@ pub struct MutationTransformMeta { pub op: Mutation, } -#[typetag::serde(name = "mutation_source_meta")] +#[typetag::serde(name = "mutation_transform_meta")] impl BlockMetaInfo for MutationTransformMeta { fn as_any(&self) -> &dyn Any { self diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs index 6f4db9e57279..50a32d4d3507 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -13,38 +13,110 @@ // limitations under the License. 
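From here, `MutationSource` stops being the thin batched reader on the `-` side of this hunk and becomes the one source shared by DELETE and UPDATE, selected through `MutationAction`; serialization has moved out into `SerializeDataTransform`. Its `event()` maps each state onto the scheduler contract roughly as follows (a condensed view of the code below, not an exhaustive one):

    // ReadData / ReadRemain        -> Event::Async (storage reads in async_process)
    // FilterData / MergeRemain /
    // PerformOperator              -> Event::Sync  (CPU work in process)
    // Output(next_part, block)     -> push the block, then Event::NeedConsume
    // Finish                       -> Event::Finished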
use std::any::Any; +use std::ops::Not; use std::sync::Arc; -use common_base::base::tokio; +use common_base::base::Progress; +use common_base::base::ProgressValues; use common_catalog::plan::PartInfoPtr; use common_catalog::table_context::TableContext; -use common_datablocks::DataBlock; +use common_exception::ErrorCode; use common_exception::Result; +use common_expression::types::AnyType; +use common_expression::types::DataType; +use common_expression::BlockEntry; +use common_expression::Column; +use common_expression::DataBlock; +use common_expression::Evaluator; +use common_expression::Expr; +use common_expression::Value; +use common_functions::scalars::BUILTIN_FUNCTIONS; +use common_sql::evaluator::BlockOperator; +use common_storages_table_meta::meta::ClusterStatistics; use crate::fuse_part::FusePartInfo; use crate::io::BlockReader; use crate::io::ReadSettings; +use crate::operations::mutation::DataChunks; use crate::operations::mutation::MutationPartInfo; -use crate::operations::read::DataSourceMeta; +use crate::operations::mutation::SerializeDataMeta; use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; +use crate::pipelines::processors::processor::ProcessorPtr; use crate::pipelines::processors::Processor; -use crate::MergeIOReadResult; +use crate::pruning::BlockIndex; + +pub enum MutationAction { + Deletion, + Update, +} + +enum State { + ReadData(Option), + FilterData(PartInfoPtr, DataChunks), + ReadRemain { + part: PartInfoPtr, + data_block: DataBlock, + filter: Value, + }, + MergeRemain { + part: PartInfoPtr, + chunks: DataChunks, + data_block: DataBlock, + filter: Value, + }, + PerformOperator(DataBlock), + Output(Option, DataBlock), + Finish, +} pub struct MutationSource { - finished: bool, + state: State, + output: Arc, + scan_progress: Arc, + ctx: Arc, - batch_size: usize, + filter: Arc>, block_reader: Arc, + remain_reader: Arc>, + operators: Vec, + action: MutationAction, - output: Arc, - output_data: Option<(Vec, Vec)>, + index: BlockIndex, + origin_stats: Option, +} + +impl MutationSource { + pub fn try_create( + ctx: Arc, + action: MutationAction, + output: Arc, + filter: Arc>, + block_reader: Arc, + remain_reader: Arc>, + operators: Vec, + ) -> Result { + let scan_progress = ctx.get_scan_progress(); + Ok(ProcessorPtr::create(Box::new(MutationSource { + state: State::ReadData(None), + output, + scan_progress, + ctx: ctx.clone(), + filter, + block_reader, + remain_reader, + operators, + action, + index: (0, 0), + origin_stats: None, + }))) + } } #[async_trait::async_trait] impl Processor for MutationSource { fn name(&self) -> String { - String::from("MutationSource") + "MutationSource".to_string() } fn as_any(&mut self) -> &mut dyn Any { @@ -52,7 +124,14 @@ impl Processor for MutationSource { } fn event(&mut self) -> Result { - if self.finished { + if matches!(self.state, State::ReadData(None)) { + self.state = match self.ctx.try_get_part() { + None => State::Finish, + Some(part) => State::ReadData(Some(part)), + } + } + + if matches!(self.state, State::Finish) { self.output.finish(); return Ok(Event::Finished); } @@ -65,47 +144,235 @@ impl Processor for MutationSource { return Ok(Event::NeedConsume); } - if let Some((part, data)) = self.output_data.take() { - let output = DataBlock::empty_with_meta(DataSourceMeta::create(part, data)); - self.output.push_data(Ok(output)); + if matches!(self.state, State::Output(_, _)) { + if let State::Output(part, data_block) = + std::mem::replace(&mut self.state, State::Finish) + { + 
self.state = match part { + None => State::Finish, + Some(part) => State::ReadData(Some(part)), + }; + + self.output.push_data(Ok(data_block)); + return Ok(Event::NeedConsume); + } } - Ok(Event::Async) + if matches!(self.state, State::ReadData(_) | State::ReadRemain { .. }) { + Ok(Event::Async) + } else { + Ok(Event::Sync) + } } - async fn async_process(&mut self) -> Result<()> { - let parts = self.ctx.try_get_parts(self.batch_size); + fn process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Finish) { + State::FilterData(part, chunks) => { + let mut data_block = self + .block_reader + .deserialize_parquet_chunks(part.clone(), chunks)?; + let num_rows = data_block.num_rows(); - if !parts.is_empty() { - let mut chunks = Vec::with_capacity(parts.len()); - for part in &parts { - let part = part.clone(); - let block_reader = self.block_reader.clone(); - let settings = ReadSettings::from_ctx(&self.ctx)?; + if let Some(filter) = self.filter.as_ref() { + let func_ctx = self.ctx.try_get_function_context()?; + let evaluator = Evaluator::new(&data_block, func_ctx, &BUILTIN_FUNCTIONS); + + let res = evaluator.run(filter).map_err(|(_, e)| { + ErrorCode::Internal(format!("eval filter failed: {}.", e)) + })?; + let predicates = DataBlock::cast_to_nonull_boolean(&res).ok_or_else(|| { + ErrorCode::BadArguments( + "Result of filter expression cannot be converted to boolean.", + ) + })?; + + let affect_rows = match &predicates { + Value::Scalar(v) => { + if *v { + num_rows + } else { + 0 + } + } + Value::Column(bitmap) => bitmap.len() - bitmap.unset_bits(), + }; - chunks.push(async move { - tokio::spawn(async move { - let deletion_part = MutationPartInfo::from_part(&part)?; - let fuse_part = FusePartInfo::from_part(&deletion_part.inner_part)?; - - block_reader - .read_columns_data_by_merge_io( - &settings, - &fuse_part.location, - &fuse_part.columns_meta, - ) - .await - }) - .await - .unwrap() - }); + if affect_rows != 0 { + let progress_values = ProgressValues { + rows: affect_rows, + bytes: 0, + }; + self.scan_progress.incr(&progress_values); + + match self.action { + MutationAction::Deletion => { + if affect_rows == num_rows { + // all the rows should be removed. + let meta = + SerializeDataMeta::create(self.index, self.origin_stats); + self.state = State::Output( + self.ctx.try_get_part(), + DataBlock::empty_with_meta(meta), + ); + } else { + let predicate_col = predicates.into_column().unwrap(); + let filter = + Value::Column(Column::Boolean(predicate_col.not())); + data_block = data_block.filter(&filter)?; + if self.remain_reader.is_none() { + let meta = SerializeDataMeta::create( + self.index, + self.origin_stats, + ); + self.state = State::Output( + self.ctx.try_get_part(), + data_block.add_meta(Some(meta))?, + ); + } else { + self.state = State::ReadRemain { + part, + data_block, + filter, + } + } + } + } + MutationAction::Update => { + let filter = Value::upcast(predicates); + if self.remain_reader.is_none() { + data_block.add_column(BlockEntry { + data_type: DataType::Boolean, + value: filter, + }); + self.state = State::PerformOperator(data_block); + } else { + self.state = State::ReadRemain { + part, + data_block, + filter, + }; + } + } + } + } else { + // Do nothing. 
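+                        // affect_rows == 0 here: e.g. a predicate bitmap
+                        // [false, false, false] gives len() - unset_bits() == 0,
+                        // so no row in this block matches the filter. Emit an
+                        // empty block *without* SerializeDataMeta; downstream,
+                        // SerializeDataTransform treats the missing meta as
+                        // Mutation::DoNothing and the block is left untouched.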
+ self.state = State::Output(self.ctx.try_get_part(), DataBlock::empty()); + } + } else { + let progress_values = ProgressValues { + rows: num_rows, + bytes: 0, + }; + self.scan_progress.incr(&progress_values); + self.state = State::PerformOperator(data_block); + } } + State::MergeRemain { + part, + chunks, + mut data_block, + filter, + } => { + if let Some(remain_reader) = self.remain_reader.as_ref() { + let remain_block = remain_reader.deserialize_parquet_chunks(part, chunks)?; + + match self.action { + MutationAction::Deletion => { + let remain_block = remain_block.filter(&filter)?; + for col in remain_block.columns() { + data_block.add_column(col.clone()); + } + } + MutationAction::Update => { + for col in remain_block.columns() { + data_block.add_column(col.clone()); + } + data_block.add_column(BlockEntry { + data_type: DataType::Boolean, + value: filter, + }); + } + } + data_block + } else { + return Err(ErrorCode::Internal("It's a bug. Need remain reader")); + }; - self.output_data = Some((parts, futures::future::try_join_all(chunks).await?)); - return Ok(()); + self.state = State::PerformOperator(data_block); + } + State::PerformOperator(data_block) => { + let func_ctx = self.ctx.try_get_function_context()?; + let block = self + .operators + .iter() + .try_fold(data_block, |input, op| op.execute(&func_ctx, input))?; + let meta = SerializeDataMeta::create(self.index, self.origin_stats); + self.state = State::Output(self.ctx.try_get_part(), block.add_meta(Some(meta))?); + } + _ => return Err(ErrorCode::Internal("It's a bug.")), } + Ok(()) + } - self.finished = true; + async fn async_process(&mut self) -> Result<()> { + match std::mem::replace(&mut self.state, State::Finish) { + State::ReadData(Some(part)) => { + let settings = ReadSettings::from_ctx(&self.ctx)?; + let part = MutationPartInfo::from_part(&part)?; + self.index = part.index; + self.origin_stats = part.cluster_stats.clone(); + let inner_part = part.inner_part.clone(); + let fuse_part = FusePartInfo::from_part(&inner_part)?; + + let read_res = self + .block_reader + .read_columns_data_by_merge_io( + &settings, + &fuse_part.location, + &fuse_part.columns_meta, + ) + .await?; + let chunks = read_res + .columns_chunks()? + .into_iter() + .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) + .collect::>(); + self.state = State::FilterData(inner_part, chunks); + } + State::ReadRemain { + part, + data_block, + filter, + } => { + if let Some(remain_reader) = self.remain_reader.as_ref() { + let fuse_part = FusePartInfo::from_part(&part)?; + + let settings = ReadSettings::from_ctx(&self.ctx)?; + let read_res = remain_reader + .read_columns_data_by_merge_io( + &settings, + &fuse_part.location, + &fuse_part.columns_meta, + ) + .await?; + let chunks = read_res + .columns_chunks()? + .into_iter() + .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) + .collect::>(); + + self.state = State::MergeRemain { + part, + chunks, + data_block, + filter, + }; + } else { + return Err(ErrorCode::Internal("It's a bug. 
No remain reader")); + } + } + _ => return Err(ErrorCode::Internal("It's a bug.")), + } Ok(()) } } diff --git a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs index 7991904d2dd3..a8366e94ff5a 100644 --- a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs @@ -15,10 +15,14 @@ use std::any::Any; use std::sync::Arc; -use common_datablocks::DataBlock; +use common_catalog::table::Table; +use common_catalog::table_context::TableContext; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::DataBlock; +use common_expression::TableSchemaRef; use common_pipeline_core::processors::port::InputPort; +use common_pipeline_core::processors::processor::ProcessorPtr; use common_storages_common::blocks_to_parquet; use common_storages_table_meta::meta::BlockMeta; use common_storages_table_meta::meta::ClusterStatistics; @@ -39,6 +43,7 @@ use crate::pipelines::processors::Processor; use crate::pruning::BlockIndex; use crate::statistics::gen_columns_statistics; use crate::statistics::ClusterStatsGenerator; +use crate::FuseTable; enum State { Consume, @@ -49,6 +54,7 @@ enum State { pub struct SerializeDataTransform { state: State, + ctx: Arc, input: Arc, output: Arc, output_data: Option, @@ -57,11 +63,37 @@ pub struct SerializeDataTransform { dal: Operator, cluster_stats_gen: ClusterStatsGenerator, + schema: TableSchemaRef, index: BlockIndex, origin_stats: Option, table_compression: TableCompression, } +impl SerializeDataTransform { + pub fn try_create( + ctx: Arc, + input: Arc, + output: Arc, + table: &FuseTable, + cluster_stats_gen: ClusterStatsGenerator, + ) -> Result { + Ok(ProcessorPtr::create(Box::new(SerializeDataTransform { + state: State::Consume, + ctx: ctx.clone(), + input, + output, + output_data: None, + location_gen: table.meta_location_generator().clone(), + dal: table.get_operator(), + cluster_stats_gen, + schema: table.schema(), + index: (0, 0), + origin_stats: None, + table_compression: table.table_compression, + }))) + } +} + #[async_trait::async_trait] impl Processor for SerializeDataTransform { fn name(&self) -> String { @@ -135,13 +167,17 @@ impl Processor for SerializeDataTransform { // build block index. let location = self.location_gen.block_bloom_index_location(&block_id); - let (bloom_index_state, column_distinct_count) = - BloomIndexState::try_create(&block, location)?; + let (bloom_index_state, column_distinct_count) = BloomIndexState::try_create( + self.ctx.clone(), + self.schema.clone(), + &block, + location, + )?; let col_stats = gen_columns_statistics(&block, Some(column_distinct_count))?; // serialize data block. let mut block_data = Vec::with_capacity(100 * 1024 * 1024); - let schema = block.schema().clone(); + let schema = self.schema.clone(); let (file_size, meta_data) = blocks_to_parquet( &schema, vec![block], diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index b7831243f6e8..2cd204889ebb 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -13,42 +13,44 @@ // limitations under the License. 
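The UPDATE side gets the same restructuring below: `UpdateSource` now only reads, filters, and applies its operators, tagging the result with `SerializeDataMeta` instead of serializing blocks itself. One difference from the deletion path is that the evaluated predicate is kept, appended to the block as a Boolean `BlockEntry` (the column the old code registered as "_predicate"), so the update operators can blend old and new values row by row; presumably something in the spirit of `if(_predicate, new_value, old_value)`, though the actual update expressions are built on the interpreter side, outside this diff.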
use std::any::Any; +use std::ops::Not; use std::sync::Arc; use common_base::base::Progress; use common_base::base::ProgressValues; use common_catalog::plan::PartInfoPtr; use common_catalog::table_context::TableContext; -use common_datablocks::DataBlock; -use common_datavalues::prelude::*; use common_exception::ErrorCode; use common_exception::Result; -use common_sql::evaluator::ChunkOperator; -use common_sql::evaluator::EvalNode; -use common_storages_common::blocks_to_parquet; -use common_storages_table_meta::meta::BlockMeta; -use common_storages_table_meta::table::TableCompression; -use opendal::Operator; +use common_expression::types::AnyType; +use common_expression::types::DataType; +use common_expression::BlockEntry; +use common_expression::Column; +use common_expression::DataBlock; +use common_expression::Evaluator; +use common_expression::Expr; +use common_expression::TableSchemaRef; +use common_expression::Value; +use common_functions::scalars::BUILTIN_FUNCTIONS; +use common_sql::evaluator::BlockOperator; +use common_storages_table_meta::meta::ClusterStatistics; use crate::fuse_part::FusePartInfo; -use crate::io::write_data; use crate::io::BlockReader; use crate::io::ReadSettings; -use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::DataChunks; -use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationPartInfo; -use crate::operations::mutation::MutationTransformMeta; -use crate::operations::mutation::SerializeState; -use crate::operations::util; -use crate::operations::BloomIndexState; +use crate::operations::mutation::SerializeDataMeta; use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::processor::ProcessorPtr; use crate::pipelines::processors::Processor; use crate::pruning::BlockIndex; -use crate::statistics::gen_columns_statistics; -use crate::FuseTable; + +pub enum MutationOperator { + Deletion, + Update, +} enum State { ReadData(Option), @@ -56,63 +58,62 @@ enum State { ReadRemain { part: PartInfoPtr, data_block: DataBlock, - filter: ColumnRef, + filter: Value, }, MergeRemain { part: PartInfoPtr, chunks: DataChunks, data_block: DataBlock, - filter: ColumnRef, + filter: Value, }, - UpdateData(DataBlock), - NeedSerialize(DataBlock), - Serialized(SerializeState, Arc), - Generated(Mutation), + PerformOperator(DataBlock), Output(Option, DataBlock), Finish, } pub struct UpdateSource { state: State, - ctx: Arc, - scan_progress: Arc, output: Arc, - location_gen: TableMetaLocationGenerator, - dal: Operator, - table_compression: TableCompression, + scan_progress: Arc, + ctx: Arc, + filter: Arc>, block_reader: Arc, - filter: Arc>, remain_reader: Arc>, - operators: Vec, + operators: Vec, + mutation: MutationOperator, + output_schema: TableSchemaRef, index: BlockIndex, + origin_stats: Option, } impl UpdateSource { + #![allow(clippy::too_many_arguments)] pub fn try_create( ctx: Arc, + mutation: MutationOperator, output: Arc, - table: &FuseTable, + filter: Arc>, block_reader: Arc, - filter: Arc>, remain_reader: Arc>, - operators: Vec, + operators: Vec, + output_schema: TableSchemaRef, ) -> Result { let scan_progress = ctx.get_scan_progress(); Ok(ProcessorPtr::create(Box::new(UpdateSource { state: State::ReadData(None), - ctx: ctx.clone(), - scan_progress, output, - location_gen: table.meta_location_generator().clone(), - dal: table.get_operator(), - table_compression: table.table_compression, - block_reader, + scan_progress, + ctx: ctx.clone(), filter, + 
block_reader, remain_reader, operators, + mutation, + output_schema, index: (0, 0), + origin_stats: None, }))) } } @@ -162,10 +163,7 @@ impl Processor for UpdateSource { } } - if matches!( - self.state, - State::ReadData(_) | State::ReadRemain { .. } | State::Serialized(_, _) - ) { + if matches!(self.state, State::ReadData(_) | State::ReadRemain { .. }) { Ok(Event::Async) } else { Ok(Event::Sync) @@ -175,46 +173,103 @@ impl Processor for UpdateSource { fn process(&mut self) -> Result<()> { match std::mem::replace(&mut self.state, State::Finish) { State::FilterData(part, chunks) => { - let data_block = self + let mut data_block = self .block_reader .deserialize_parquet_chunks(part.clone(), chunks)?; + let num_rows = data_block.num_rows(); + if let Some(filter) = self.filter.as_ref() { - let filter_result = filter - .eval(&self.ctx.try_get_function_context()?, &data_block)? - .vector; - let filter = DataBlock::cast_to_nonull_boolean(&filter_result)?; - if DataBlock::filter_exists(&filter)? { - let col: &BooleanColumn = Series::check_get(&filter)?; + let func_ctx = self.ctx.try_get_function_context()?; + let evaluator = Evaluator::new(&data_block, func_ctx, &BUILTIN_FUNCTIONS); + + let res = evaluator.run(filter).map_err(|(_, e)| { + ErrorCode::Internal(format!("eval filter failed: {}.", e)) + })?; + let predicates = DataBlock::cast_to_nonull_boolean(&res).ok_or_else(|| { + ErrorCode::BadArguments( + "Result of filter expression cannot be converted to boolean.", + ) + })?; + + let affect_rows = match &predicates { + Value::Scalar(v) => { + if *v { + num_rows + } else { + 0 + } + } + Value::Column(bitmap) => bitmap.len() - bitmap.unset_bits(), + }; + + if affect_rows != 0 { let progress_values = ProgressValues { - rows: col.len() - col.values().unset_bits(), + rows: affect_rows, bytes: 0, }; self.scan_progress.incr(&progress_values); - if self.remain_reader.is_none() { - self.state = State::MergeRemain { - part, - chunks: vec![], - data_block, - filter, - }; - } else { - self.state = State::ReadRemain { - part, - data_block, - filter, - }; + match self.mutation { + MutationOperator::Deletion => { + if affect_rows == num_rows { + // all the rows should be removed. + let meta = + SerializeDataMeta::create(self.index, self.origin_stats); + self.state = State::Output( + self.ctx.try_get_part(), + DataBlock::empty_with_meta(meta), + ); + } else { + let predicate_col = predicates.into_column().unwrap(); + let filter = + Value::Column(Column::Boolean(predicate_col.not())); + data_block = data_block.filter(&filter)?; + if self.remain_reader.is_none() { + let meta = SerializeDataMeta::create( + self.index, + self.origin_stats, + ); + self.state = State::Output( + self.ctx.try_get_part(), + data_block.add_meta(Some(meta))?, + ); + } else { + self.state = State::ReadRemain { + part, + data_block, + filter, + } + } + } + } + MutationOperator::Update => { + let filter = Value::upcast(predicates); + if self.remain_reader.is_none() { + data_block.add_column(BlockEntry { + data_type: DataType::Boolean, + value: filter, + }); + self.state = State::PerformOperator(data_block); + } else { + self.state = State::ReadRemain { + part, + data_block, + filter, + }; + } + } } } else { - self.state = State::Generated(Mutation::DoNothing); + // Do nothing. 
+ self.state = State::Output(self.ctx.try_get_part(), DataBlock::empty()); } } else { let progress_values = ProgressValues { - rows: data_block.num_rows(), + rows: num_rows, bytes: 0, }; self.scan_progress.incr(&progress_values); - self.state = State::UpdateData(data_block); + self.state = State::PerformOperator(data_block); } } State::MergeRemain { @@ -225,78 +280,39 @@ impl Processor for UpdateSource { } => { if let Some(remain_reader) = self.remain_reader.as_ref() { let remain_block = remain_reader.deserialize_parquet_chunks(part, chunks)?; - for (col, field) in remain_block - .columns() - .iter() - .zip(remain_block.schema().fields()) - { - data_block = data_block.add_column(col.clone(), field.clone())?; - } - } - let field = DataField::new("_predicate", bool::to_data_type()); - data_block = data_block.add_column(filter, field)?; + match self.mutation { + MutationOperator::Deletion => { + let remain_block = remain_block.filter(&filter)?; + for col in remain_block.columns() { + data_block.add_column(col.clone()); + } + } + MutationOperator::Update => { + for col in remain_block.columns() { + data_block.add_column(col.clone()); + } + data_block.add_column(BlockEntry { + data_type: DataType::Boolean, + value: filter, + }); + } + } + data_block + } else { + return Err(ErrorCode::Internal("It's a bug. Need remain reader")); + }; - self.state = State::UpdateData(data_block); + self.state = State::PerformOperator(data_block); } - State::UpdateData(data_block) => { + State::PerformOperator(data_block) => { let func_ctx = self.ctx.try_get_function_context()?; let block = self .operators .iter() .try_fold(data_block, |input, op| op.execute(&func_ctx, input))?; - self.state = State::NeedSerialize(block); - } - State::NeedSerialize(block) => { - let row_count = block.num_rows() as u64; - let block_size = block.memory_size() as u64; - let (block_location, block_id) = self.location_gen.gen_block_location(); - - // build block index. - let location = self.location_gen.block_bloom_index_location(&block_id); - let (bloom_index_state, column_distinct_count) = - BloomIndexState::try_create(&block, location)?; - let col_stats = gen_columns_statistics(&block, Some(column_distinct_count))?; - - // serialize data block. - let mut block_data = Vec::with_capacity(100 * 1024 * 1024); - let schema = block.schema().clone(); - let (file_size, meta_data) = blocks_to_parquet( - &schema, - vec![block], - &mut block_data, - self.table_compression, - )?; - let col_metas = util::column_metas(&meta_data)?; - - // new block meta. 
- let new_meta = Arc::new(BlockMeta::new( - row_count, - block_size, - file_size, - col_stats, - col_metas, - None, - block_location.clone(), - Some(bloom_index_state.location.clone()), - bloom_index_state.size, - self.table_compression.into(), - )); - - self.state = State::Serialized( - SerializeState { - block_data, - block_location: block_location.0, - index_data: bloom_index_state.data, - index_location: bloom_index_state.location.0, - }, - new_meta, - ); - } - State::Generated(op) => { - let meta = MutationTransformMeta::create(self.index, op); - let new_part = self.ctx.try_get_part(); - self.state = State::Output(new_part, DataBlock::empty_with_meta(meta)); + let meta = SerializeDataMeta::create(self.index, self.origin_stats); + self.state = State::Output(self.ctx.try_get_part(), block.add_meta(Some(meta))?); } _ => return Err(ErrorCode::Internal("It's a bug.")), } @@ -309,6 +325,7 @@ impl Processor for UpdateSource { let settings = ReadSettings::from_ctx(&self.ctx)?; let part = MutationPartInfo::from_part(&part)?; self.index = part.index; + self.origin_stats = part.cluster_stats.clone(); let inner_part = part.inner_part.clone(); let fuse_part = FusePartInfo::from_part(&inner_part)?; @@ -359,23 +376,6 @@ impl Processor for UpdateSource { return Err(ErrorCode::Internal("It's a bug. No remain reader")); } } - State::Serialized(serialize_state, block_meta) => { - // write block data. - write_data( - &serialize_state.block_data, - &self.dal, - &serialize_state.block_location, - ) - .await?; - // write index data. - write_data( - &serialize_state.index_data, - &self.dal, - &serialize_state.index_location, - ) - .await?; - self.state = State::Generated(Mutation::Replaced(block_meta)); - } _ => return Err(ErrorCode::Internal("It's a bug.")), } Ok(()) From b49199db9a3d62a997b8b9b61187a227ac0902e6 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 11 Jan 2023 01:26:52 +0800 Subject: [PATCH 17/26] remove deletion source --- .../storages/fuse/src/operations/delete.rs | 2 +- .../mutation/deletion/deletion_source.rs | 296 ------------------ .../src/operations/mutation/deletion/mod.rs | 17 - .../fuse/src/operations/mutation/mod.rs | 2 - 4 files changed, 1 insertion(+), 316 deletions(-) delete mode 100644 src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs delete mode 100644 src/query/storages/fuse/src/operations/mutation/deletion/mod.rs diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs index 93e9082db99f..4f0f78275153 100644 --- a/src/query/storages/fuse/src/operations/delete.rs +++ b/src/query/storages/fuse/src/operations/delete.rs @@ -39,12 +39,12 @@ use common_sql::evaluator::BlockOperator; use common_storages_table_meta::meta::Location; use common_storages_table_meta::meta::TableSnapshot; -use super::mutation::SerializeDataTransform; use crate::operations::mutation::MutationAction; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSink; use crate::operations::mutation::MutationSource; use crate::operations::mutation::MutationTransform; +use crate::operations::mutation::SerializeDataTransform; use crate::pipelines::processors::port::InputPort; use crate::pipelines::processors::port::OutputPort; use crate::pipelines::Pipe; diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs b/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs deleted file mode 100644 index 91049d263ac1..000000000000 --- 
a/src/query/storages/fuse/src/operations/mutation/deletion/deletion_source.rs +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2021 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::ops::Not; -use std::sync::Arc; - -use common_catalog::plan::PartInfoPtr; -use common_catalog::table_context::TableContext; -use common_exception::ErrorCode; -use common_exception::Result; -use common_expression::types::AnyType; -use common_expression::Column; -use common_expression::DataBlock; -use common_expression::DataSchema; -use common_expression::Evaluator; -use common_expression::Expr; -use common_expression::TableSchemaRef; -use common_expression::Value; -use common_functions::scalars::BUILTIN_FUNCTIONS; -use common_storages_table_meta::meta::ClusterStatistics; - -use crate::fuse_part::FusePartInfo; -use crate::io::BlockReader; -use crate::io::ReadSettings; -use crate::operations::mutation::DataChunks; -use crate::operations::mutation::MutationPartInfo; -use crate::operations::mutation::SerializeDataMeta; -use crate::pipelines::processors::port::OutputPort; -use crate::pipelines::processors::processor::Event; -use crate::pipelines::processors::processor::ProcessorPtr; -use crate::pipelines::processors::Processor; -use crate::pruning::BlockIndex; - -enum State { - ReadData(Option), - FilterData(PartInfoPtr, DataChunks), - ReadRemain { - part: PartInfoPtr, - data_block: DataBlock, - filter: Value, - }, - MergeRemain { - part: PartInfoPtr, - chunks: DataChunks, - data_block: DataBlock, - filter: Value, - }, - Output(Option, DataBlock), - Finish, -} - -pub struct DeletionSource { - state: State, - output: Arc, - - ctx: Arc, - filter: Arc, - block_reader: Arc, - remain_reader: Arc>, - - output_schema: TableSchemaRef, - index: BlockIndex, - origin_stats: Option, -} - -impl DeletionSource { - pub fn try_create( - ctx: Arc, - output: Arc, - filter: Arc, - block_reader: Arc, - remain_reader: Arc>, - output_schema: TableSchemaRef, - ) -> Result { - Ok(ProcessorPtr::create(Box::new(DeletionSource { - state: State::ReadData(None), - output, - ctx: ctx.clone(), - filter, - block_reader, - remain_reader, - output_schema, - index: (0, 0), - origin_stats: None, - }))) - } -} - -#[async_trait::async_trait] -impl Processor for DeletionSource { - fn name(&self) -> String { - "DeletionSource".to_string() - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if matches!(self.state, State::ReadData(None)) { - self.state = match self.ctx.try_get_part() { - None => State::Finish, - Some(part) => State::ReadData(Some(part)), - } - } - - if matches!(self.state, State::Finish) { - self.output.finish(); - return Ok(Event::Finished); - } - - if self.output.is_finished() { - return Ok(Event::Finished); - } - - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } - - if matches!(self.state, State::Output(_, _)) { - if let State::Output(part, data_block) = - std::mem::replace(&mut self.state, State::Finish) - { - self.state = match 
part { - None => State::Finish, - Some(part) => State::ReadData(Some(part)), - }; - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - } - - if matches!(self.state, State::ReadData(_) | State::ReadRemain { .. }) { - Ok(Event::Async) - } else { - Ok(Event::Sync) - } - } - - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::FilterData(part, chunks) => { - let mut data_block = self - .block_reader - .deserialize_parquet_chunks(part.clone(), chunks)?; - - let func_ctx = self.ctx.try_get_function_context()?; - let evaluator = Evaluator::new(&data_block, func_ctx, &BUILTIN_FUNCTIONS); - - let res = evaluator - .run(&self.filter) - .map_err(|(_, e)| ErrorCode::Internal(format!("eval filter failed: {}.", e)))?; - let predicates = DataBlock::cast_to_nonull_boolean(&res).ok_or_else(|| { - ErrorCode::BadArguments( - "Result of filter expression cannot be converted to boolean.", - ) - })?; - - let predicate_col = predicates.into_column().unwrap(); - let filter = Value::Column(Column::Boolean(predicate_col.not())); - if !DataBlock::filter_exists(&filter)? { - // all the rows should be removed. - let meta = SerializeDataMeta::create(self.index, self.origin_stats); - self.state = - State::Output(self.ctx.try_get_part(), DataBlock::empty_with_meta(meta)); - } else { - let num_rows = data_block.num_rows(); - data_block = data_block.filter(&filter)?; - if data_block.num_rows() == num_rows { - // none of the rows should be removed. - self.state = State::Output(self.ctx.try_get_part(), DataBlock::empty()); - } else if self.remain_reader.is_none() { - let meta = SerializeDataMeta::create(self.index, self.origin_stats); - self.state = State::Output( - self.ctx.try_get_part(), - data_block.add_meta(Some(meta))?, - ); - } else { - self.state = State::ReadRemain { - part, - data_block, - filter, - } - } - } - } - State::MergeRemain { - part, - chunks, - mut data_block, - filter, - } => { - let mut fields = self.block_reader.data_fields(); - let merged = if chunks.is_empty() { - data_block - } else if let Some(remain_reader) = self.remain_reader.as_ref() { - let mut remain_fields = remain_reader.data_fields(); - fields.append(&mut remain_fields); - let remain_block = remain_reader.deserialize_parquet_chunks(part, chunks)?; - let remain_block = remain_block.filter(&filter)?; - for col in remain_block.columns() { - data_block.add_column(col.clone()); - } - data_block - } else { - return Err(ErrorCode::Internal("It's a bug. 
Need remain reader")); - }; - - let src_schema = DataSchema::new(fields); - let dest_schema = self.output_schema.clone().into(); - let block = merged.resort(&src_schema, &dest_schema)?; - - let meta = SerializeDataMeta::create(self.index, self.origin_stats); - self.state = State::Output(self.ctx.try_get_part(), block.add_meta(Some(meta))?); - } - _ => return Err(ErrorCode::Internal("It's a bug.")), - } - Ok(()) - } - - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::ReadData(Some(part)) => { - let settings = ReadSettings::from_ctx(&self.ctx)?; - let deletion_part = MutationPartInfo::from_part(&part)?; - self.index = deletion_part.index; - self.origin_stats = deletion_part.cluster_stats.clone(); - let part = deletion_part.inner_part.clone(); - let fuse_part = FusePartInfo::from_part(&part)?; - - let read_res = self - .block_reader - .read_columns_data_by_merge_io( - &settings, - &fuse_part.location, - &fuse_part.columns_meta, - ) - .await?; - let chunks = read_res - .columns_chunks()? - .into_iter() - .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) - .collect::>(); - - self.state = State::FilterData(part, chunks); - } - State::ReadRemain { - part, - data_block, - filter, - } => { - if let Some(remain_reader) = self.remain_reader.as_ref() { - let fuse_part = FusePartInfo::from_part(&part)?; - - let settings = ReadSettings::from_ctx(&self.ctx)?; - let read_res = remain_reader - .read_columns_data_by_merge_io( - &settings, - &fuse_part.location, - &fuse_part.columns_meta, - ) - .await?; - let chunks = read_res - .columns_chunks()? - .into_iter() - .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) - .collect::>(); - - self.state = State::MergeRemain { - part, - chunks, - data_block, - filter, - }; - } else { - return Err(ErrorCode::Internal("It's a bug. No remain reader")); - } - } - _ => return Err(ErrorCode::Internal("It's a bug.")), - } - Ok(()) - } -} diff --git a/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs b/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs deleted file mode 100644 index 075f941b5fb4..000000000000 --- a/src/query/storages/fuse/src/operations/mutation/deletion/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
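The DeletionSource deleted above duplicated most of the update path; this series replaces it (and, in patch 19, UpdateSource) with a single MutationSource driven by one per-partition state machine. A minimal, self-contained sketch of that flow follows; the variant names mirror the enums in the diffs, but the payloads are stand-ins (a plain u32 part id instead of PartInfoPtr and DataBlock), so this is illustrative only, not the crate's real types.

    // Illustrative state machine; payload types are assumptions, not the real ones.
    #[derive(Debug, Clone, Copy)]
    enum State {
        ReadData(Option<u32>), // Some(part) queued for reading, None = ask for a part
        FilterData(u32),       // evaluate the predicate over the block
        ReadRemain(u32),       // fetch columns outside the filter projection
        MergeRemain(u32),      // splice remain columns (and _predicate) back in
        PerformOperator(u32),  // run the update/delete block operators
        Output(Option<u32>),   // emit the block, carrying the next part if any
        Finish,
    }

    fn step(state: State, next_part: Option<u32>, needs_remain: bool) -> State {
        match state {
            State::ReadData(None) => match next_part {
                Some(p) => State::ReadData(Some(p)),
                None => State::Finish,
            },
            State::ReadData(Some(p)) => State::FilterData(p),
            State::FilterData(p) if needs_remain => State::ReadRemain(p),
            State::FilterData(p) => State::PerformOperator(p),
            State::ReadRemain(p) => State::MergeRemain(p),
            State::MergeRemain(p) => State::PerformOperator(p),
            State::PerformOperator(_) => State::Output(next_part),
            State::Output(Some(p)) => State::ReadData(Some(p)),
            State::Output(None) | State::Finish => State::Finish,
        }
    }

    fn main() {
        // One partition, no remain reader:
        // ReadData -> FilterData -> PerformOperator -> Output -> Finish.
        let mut parts = vec![7u32].into_iter();
        let mut s = State::ReadData(None);
        loop {
            println!("{:?}", s);
            if matches!(s, State::Finish) { break; }
            let nxt = match s {
                State::ReadData(None) | State::PerformOperator(_) => parts.next(),
                _ => None,
            };
            s = step(s, nxt, false);
        }
    }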
- -mod deletion_source; - -pub use deletion_source::DeletionSource; diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs index af9ed4a2a896..ec1c96fd39c7 100644 --- a/src/query/storages/fuse/src/operations/mutation/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/mod.rs @@ -15,7 +15,6 @@ pub mod abort_operation; pub mod base_mutator; mod compact; -mod deletion; pub mod mutation_meta; mod mutation_part; pub mod mutation_sink; @@ -35,7 +34,6 @@ pub use compact::MergeSegmentsTransform; pub use compact::SegmentCompactMutator; pub use compact::SegmentCompactionState; pub use compact::SegmentCompactor; -pub use deletion::DeletionSource; pub use mutation_meta::Mutation; pub use mutation_meta::MutationSinkMeta; pub use mutation_meta::MutationTransformMeta; From 93322b6d69c26a6ddb9913bb97b8767f3de162b6 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 11 Jan 2023 01:41:11 +0800 Subject: [PATCH 18/26] resolve conflict --- .../service/src/interpreters/interpreter_delete.rs | 1 - .../storages/common/pruner/src/topn_pruner.rs | 2 +- .../fuse/src/operations/mutation/mutation_meta.rs | 13 ++++++++----- .../src/operations/mutation/mutation_source.rs | 8 ++++---- .../mutation/serialize_data_transform.rs | 14 +++++++------- .../operations/mutation/update/update_source.rs | 8 ++++---- 6 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/query/service/src/interpreters/interpreter_delete.rs b/src/query/service/src/interpreters/interpreter_delete.rs index 097913d8ea99..31f71a4a5460 100644 --- a/src/query/service/src/interpreters/interpreter_delete.rs +++ b/src/query/service/src/interpreters/interpreter_delete.rs @@ -17,7 +17,6 @@ use std::sync::Arc; use common_exception::Result; use common_expression::DataSchemaRef; use common_pipeline_core::Pipeline; -use common_sql::plans::DeletePlan; use crate::interpreters::Interpreter; use crate::pipelines::executor::ExecutorSettings; diff --git a/src/query/storages/common/pruner/src/topn_pruner.rs b/src/query/storages/common/pruner/src/topn_pruner.rs index f4ea31246fd8..b29712ca1c05 100644 --- a/src/query/storages/common/pruner/src/topn_pruner.rs +++ b/src/query/storages/common/pruner/src/topn_pruner.rs @@ -22,7 +22,7 @@ use common_expression::TableSchemaRef; use storages_common_table_meta::meta::BlockMeta; use storages_common_table_meta::meta::ColumnStatistics; -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, PartialEq)] pub struct BlockMetaIndex { pub segment_idx: usize, pub block_idx: usize, diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs index 53b7eb7b3ca7..f079cb816e3c 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs @@ -19,17 +19,17 @@ use common_exception::ErrorCode; use common_exception::Result; use common_expression::BlockMetaInfo; use common_expression::BlockMetaInfoPtr; +use storages_common_pruner::BlockMetaIndex; use storages_common_table_meta::meta::BlockMeta; use storages_common_table_meta::meta::ClusterStatistics; use storages_common_table_meta::meta::Location; use storages_common_table_meta::meta::Statistics; use crate::operations::mutation::AbortOperation; -use crate::pruning::BlockIndex; #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] pub struct 
SerializeDataMeta { - pub index: BlockIndex, + pub index: BlockMetaIndex, pub cluster_stats: Option, } @@ -56,7 +56,10 @@ impl BlockMetaInfo for SerializeDataMeta { } impl SerializeDataMeta { - pub fn create(index: BlockIndex, cluster_stats: Option) -> BlockMetaInfoPtr { + pub fn create( + index: BlockMetaIndex, + cluster_stats: Option, + ) -> BlockMetaInfoPtr { Box::new(SerializeDataMeta { index, cluster_stats, @@ -82,7 +85,7 @@ pub enum Mutation { #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq)] pub struct MutationTransformMeta { - pub index: BlockIndex, + pub index: BlockMetaIndex, pub op: Mutation, } @@ -109,7 +112,7 @@ impl BlockMetaInfo for MutationTransformMeta { } impl MutationTransformMeta { - pub fn create(index: BlockIndex, op: Mutation) -> BlockMetaInfoPtr { + pub fn create(index: BlockMetaIndex, op: Mutation) -> BlockMetaInfoPtr { Box::new(MutationTransformMeta { index, op }) } diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs index 50a32d4d3507..33a3e4582d3a 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -32,7 +32,8 @@ use common_expression::Expr; use common_expression::Value; use common_functions::scalars::BUILTIN_FUNCTIONS; use common_sql::evaluator::BlockOperator; -use common_storages_table_meta::meta::ClusterStatistics; +use storages_common_pruner::BlockMetaIndex; +use storages_common_table_meta::meta::ClusterStatistics; use crate::fuse_part::FusePartInfo; use crate::io::BlockReader; @@ -44,7 +45,6 @@ use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::processor::ProcessorPtr; use crate::pipelines::processors::Processor; -use crate::pruning::BlockIndex; pub enum MutationAction { Deletion, @@ -82,7 +82,7 @@ pub struct MutationSource { operators: Vec, action: MutationAction, - index: BlockIndex, + index: BlockMetaIndex, origin_stats: Option, } @@ -107,7 +107,7 @@ impl MutationSource { remain_reader, operators, action, - index: (0, 0), + index: BlockMetaIndex::default(), origin_stats: None, }))) } diff --git a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs index a8366e94ff5a..ce54a3867463 100644 --- a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs @@ -23,11 +23,12 @@ use common_expression::DataBlock; use common_expression::TableSchemaRef; use common_pipeline_core::processors::port::InputPort; use common_pipeline_core::processors::processor::ProcessorPtr; -use common_storages_common::blocks_to_parquet; -use common_storages_table_meta::meta::BlockMeta; -use common_storages_table_meta::meta::ClusterStatistics; -use common_storages_table_meta::table::TableCompression; use opendal::Operator; +use storages_common_blocks::blocks_to_parquet; +use storages_common_pruner::BlockMetaIndex; +use storages_common_table_meta::meta::BlockMeta; +use storages_common_table_meta::meta::ClusterStatistics; +use storages_common_table_meta::table::TableCompression; use crate::io::write_data; use crate::io::TableMetaLocationGenerator; @@ -40,7 +41,6 @@ use crate::operations::BloomIndexState; use crate::pipelines::processors::port::OutputPort; use 
crate::pipelines::processors::processor::Event; use crate::pipelines::processors::Processor; -use crate::pruning::BlockIndex; use crate::statistics::gen_columns_statistics; use crate::statistics::ClusterStatsGenerator; use crate::FuseTable; @@ -64,7 +64,7 @@ pub struct SerializeDataTransform { cluster_stats_gen: ClusterStatsGenerator, schema: TableSchemaRef, - index: BlockIndex, + index: BlockMetaIndex, origin_stats: Option, table_compression: TableCompression, } @@ -87,7 +87,7 @@ impl SerializeDataTransform { dal: table.get_operator(), cluster_stats_gen, schema: table.schema(), - index: (0, 0), + index: BlockMetaIndex::default(), origin_stats: None, table_compression: table.table_compression, }))) diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs index 2cd204889ebb..2f06842b9d70 100644 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs @@ -33,7 +33,8 @@ use common_expression::TableSchemaRef; use common_expression::Value; use common_functions::scalars::BUILTIN_FUNCTIONS; use common_sql::evaluator::BlockOperator; -use common_storages_table_meta::meta::ClusterStatistics; +use storages_common_pruner::BlockMetaIndex; +use storages_common_table_meta::meta::ClusterStatistics; use crate::fuse_part::FusePartInfo; use crate::io::BlockReader; @@ -45,7 +46,6 @@ use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::processor::ProcessorPtr; use crate::pipelines::processors::Processor; -use crate::pruning::BlockIndex; pub enum MutationOperator { Deletion, @@ -84,7 +84,7 @@ pub struct UpdateSource { mutation: MutationOperator, output_schema: TableSchemaRef, - index: BlockIndex, + index: BlockMetaIndex, origin_stats: Option, } @@ -112,7 +112,7 @@ impl UpdateSource { operators, mutation, output_schema, - index: (0, 0), + index: BlockMetaIndex::default(), origin_stats: None, }))) } From cbe97ac54ab0d1aa10497b382c7c3d976803fcde Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 11 Jan 2023 01:48:35 +0800 Subject: [PATCH 19/26] remove update source --- .../mutation/compact/compact_transform.rs | 8 +- .../fuse/src/operations/mutation/mod.rs | 6 +- .../operations/mutation/mutation_source.rs | 3 +- .../mutation/serialize_data_transform.rs | 8 +- .../src/operations/mutation/update/mod.rs | 17 - .../mutation/update/update_source.rs | 383 ------------------ .../fuse/src/operations/mutation/util.rs | 22 - 7 files changed, 11 insertions(+), 436 deletions(-) delete mode 100644 src/query/storages/fuse/src/operations/mutation/update/mod.rs delete mode 100644 src/query/storages/fuse/src/operations/mutation/update/update_source.rs delete mode 100644 src/query/storages/fuse/src/operations/mutation/util.rs diff --git a/src/query/storages/fuse/src/operations/mutation/compact/compact_transform.rs b/src/query/storages/fuse/src/operations/mutation/compact/compact_transform.rs index f020398e6490..30df168ede97 100644 --- a/src/query/storages/fuse/src/operations/mutation/compact/compact_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/compact/compact_transform.rs @@ -46,6 +46,7 @@ use crate::io::TableMetaLocationGenerator; use crate::io::WriteSettings; use crate::metrics::*; use crate::operations::mutation::AbortOperation; +use crate::operations::mutation::SerializeState; use crate::pipelines::processors::port::InputPort; 
use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; @@ -55,13 +56,6 @@ use crate::statistics::reduce_block_statistics; use crate::statistics::reducers::reduce_block_metas; use crate::FuseStorageFormat; -struct SerializeState { - block_data: Vec, - block_location: String, - index_data: Vec, - index_location: String, -} - enum State { Consume, ReadBlocks, diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs index ec1c96fd39c7..4e541a7d7261 100644 --- a/src/query/storages/fuse/src/operations/mutation/mod.rs +++ b/src/query/storages/fuse/src/operations/mutation/mod.rs @@ -22,8 +22,6 @@ mod mutation_source; mod mutation_transform; pub mod recluster_mutator; mod serialize_data_transform; -mod update; -mod util; pub use abort_operation::AbortOperation; pub use base_mutator::BaseMutator; @@ -45,6 +43,4 @@ pub use mutation_source::MutationSource; pub use mutation_transform::MutationTransform; pub use recluster_mutator::ReclusterMutator; pub use serialize_data_transform::SerializeDataTransform; -pub use update::UpdateSource; -pub use util::DataChunks; -pub use util::SerializeState; +pub use serialize_data_transform::SerializeState; diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs index 33a3e4582d3a..00140ba8efcd 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -38,7 +38,6 @@ use storages_common_table_meta::meta::ClusterStatistics; use crate::fuse_part::FusePartInfo; use crate::io::BlockReader; use crate::io::ReadSettings; -use crate::operations::mutation::DataChunks; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::SerializeDataMeta; use crate::pipelines::processors::port::OutputPort; @@ -46,6 +45,8 @@ use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::processor::ProcessorPtr; use crate::pipelines::processors::Processor; +type DataChunks = Vec<(usize, Vec)>; + pub enum MutationAction { Deletion, Update, diff --git a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs index ce54a3867463..88f9552820c9 100644 --- a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs @@ -35,7 +35,6 @@ use crate::io::TableMetaLocationGenerator; use crate::operations::mutation::Mutation; use crate::operations::mutation::MutationTransformMeta; use crate::operations::mutation::SerializeDataMeta; -use crate::operations::mutation::SerializeState; use crate::operations::util; use crate::operations::BloomIndexState; use crate::pipelines::processors::port::OutputPort; @@ -45,6 +44,13 @@ use crate::statistics::gen_columns_statistics; use crate::statistics::ClusterStatsGenerator; use crate::FuseTable; +pub struct SerializeState { + pub block_data: Vec, + pub block_location: String, + pub index_data: Vec, + pub index_location: String, +} + enum State { Consume, NeedSerialize(DataBlock), diff --git a/src/query/storages/fuse/src/operations/mutation/update/mod.rs b/src/query/storages/fuse/src/operations/mutation/update/mod.rs deleted file mode 100644 index 05b474dc0459..000000000000 --- 
a/src/query/storages/fuse/src/operations/mutation/update/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod update_source; - -pub use update_source::UpdateSource; diff --git a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs b/src/query/storages/fuse/src/operations/mutation/update/update_source.rs deleted file mode 100644 index 2f06842b9d70..000000000000 --- a/src/query/storages/fuse/src/operations/mutation/update/update_source.rs +++ /dev/null @@ -1,383 +0,0 @@ -// Copyright 2022 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::ops::Not; -use std::sync::Arc; - -use common_base::base::Progress; -use common_base::base::ProgressValues; -use common_catalog::plan::PartInfoPtr; -use common_catalog::table_context::TableContext; -use common_exception::ErrorCode; -use common_exception::Result; -use common_expression::types::AnyType; -use common_expression::types::DataType; -use common_expression::BlockEntry; -use common_expression::Column; -use common_expression::DataBlock; -use common_expression::Evaluator; -use common_expression::Expr; -use common_expression::TableSchemaRef; -use common_expression::Value; -use common_functions::scalars::BUILTIN_FUNCTIONS; -use common_sql::evaluator::BlockOperator; -use storages_common_pruner::BlockMetaIndex; -use storages_common_table_meta::meta::ClusterStatistics; - -use crate::fuse_part::FusePartInfo; -use crate::io::BlockReader; -use crate::io::ReadSettings; -use crate::operations::mutation::DataChunks; -use crate::operations::mutation::MutationPartInfo; -use crate::operations::mutation::SerializeDataMeta; -use crate::pipelines::processors::port::OutputPort; -use crate::pipelines::processors::processor::Event; -use crate::pipelines::processors::processor::ProcessorPtr; -use crate::pipelines::processors::Processor; - -pub enum MutationOperator { - Deletion, - Update, -} - -enum State { - ReadData(Option), - FilterData(PartInfoPtr, DataChunks), - ReadRemain { - part: PartInfoPtr, - data_block: DataBlock, - filter: Value, - }, - MergeRemain { - part: PartInfoPtr, - chunks: DataChunks, - data_block: DataBlock, - filter: Value, - }, - PerformOperator(DataBlock), - Output(Option, DataBlock), - Finish, -} - -pub struct UpdateSource { - state: State, - output: Arc, - scan_progress: Arc, - - ctx: Arc, - filter: Arc>, - block_reader: Arc, - remain_reader: Arc>, - operators: Vec, - mutation: MutationOperator, - 
- output_schema: TableSchemaRef, - index: BlockMetaIndex, - origin_stats: Option, -} - -impl UpdateSource { - #![allow(clippy::too_many_arguments)] - pub fn try_create( - ctx: Arc, - mutation: MutationOperator, - output: Arc, - filter: Arc>, - block_reader: Arc, - remain_reader: Arc>, - operators: Vec, - output_schema: TableSchemaRef, - ) -> Result { - let scan_progress = ctx.get_scan_progress(); - Ok(ProcessorPtr::create(Box::new(UpdateSource { - state: State::ReadData(None), - output, - scan_progress, - ctx: ctx.clone(), - filter, - block_reader, - remain_reader, - operators, - mutation, - output_schema, - index: BlockMetaIndex::default(), - origin_stats: None, - }))) - } -} - -#[async_trait::async_trait] -impl Processor for UpdateSource { - fn name(&self) -> String { - "UpdateSource".to_string() - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn event(&mut self) -> Result { - if matches!(self.state, State::ReadData(None)) { - self.state = match self.ctx.try_get_part() { - None => State::Finish, - Some(part) => State::ReadData(Some(part)), - } - } - - if matches!(self.state, State::Finish) { - self.output.finish(); - return Ok(Event::Finished); - } - - if self.output.is_finished() { - return Ok(Event::Finished); - } - - if !self.output.can_push() { - return Ok(Event::NeedConsume); - } - - if matches!(self.state, State::Output(_, _)) { - if let State::Output(part, data_block) = - std::mem::replace(&mut self.state, State::Finish) - { - self.state = match part { - None => State::Finish, - Some(part) => State::ReadData(Some(part)), - }; - - self.output.push_data(Ok(data_block)); - return Ok(Event::NeedConsume); - } - } - - if matches!(self.state, State::ReadData(_) | State::ReadRemain { .. }) { - Ok(Event::Async) - } else { - Ok(Event::Sync) - } - } - - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::FilterData(part, chunks) => { - let mut data_block = self - .block_reader - .deserialize_parquet_chunks(part.clone(), chunks)?; - let num_rows = data_block.num_rows(); - - if let Some(filter) = self.filter.as_ref() { - let func_ctx = self.ctx.try_get_function_context()?; - let evaluator = Evaluator::new(&data_block, func_ctx, &BUILTIN_FUNCTIONS); - - let res = evaluator.run(filter).map_err(|(_, e)| { - ErrorCode::Internal(format!("eval filter failed: {}.", e)) - })?; - let predicates = DataBlock::cast_to_nonull_boolean(&res).ok_or_else(|| { - ErrorCode::BadArguments( - "Result of filter expression cannot be converted to boolean.", - ) - })?; - - let affect_rows = match &predicates { - Value::Scalar(v) => { - if *v { - num_rows - } else { - 0 - } - } - Value::Column(bitmap) => bitmap.len() - bitmap.unset_bits(), - }; - - if affect_rows != 0 { - let progress_values = ProgressValues { - rows: affect_rows, - bytes: 0, - }; - self.scan_progress.incr(&progress_values); - - match self.mutation { - MutationOperator::Deletion => { - if affect_rows == num_rows { - // all the rows should be removed. 
- let meta = - SerializeDataMeta::create(self.index, self.origin_stats); - self.state = State::Output( - self.ctx.try_get_part(), - DataBlock::empty_with_meta(meta), - ); - } else { - let predicate_col = predicates.into_column().unwrap(); - let filter = - Value::Column(Column::Boolean(predicate_col.not())); - data_block = data_block.filter(&filter)?; - if self.remain_reader.is_none() { - let meta = SerializeDataMeta::create( - self.index, - self.origin_stats, - ); - self.state = State::Output( - self.ctx.try_get_part(), - data_block.add_meta(Some(meta))?, - ); - } else { - self.state = State::ReadRemain { - part, - data_block, - filter, - } - } - } - } - MutationOperator::Update => { - let filter = Value::upcast(predicates); - if self.remain_reader.is_none() { - data_block.add_column(BlockEntry { - data_type: DataType::Boolean, - value: filter, - }); - self.state = State::PerformOperator(data_block); - } else { - self.state = State::ReadRemain { - part, - data_block, - filter, - }; - } - } - } - } else { - // Do nothing. - self.state = State::Output(self.ctx.try_get_part(), DataBlock::empty()); - } - } else { - let progress_values = ProgressValues { - rows: num_rows, - bytes: 0, - }; - self.scan_progress.incr(&progress_values); - self.state = State::PerformOperator(data_block); - } - } - State::MergeRemain { - part, - chunks, - mut data_block, - filter, - } => { - if let Some(remain_reader) = self.remain_reader.as_ref() { - let remain_block = remain_reader.deserialize_parquet_chunks(part, chunks)?; - - match self.mutation { - MutationOperator::Deletion => { - let remain_block = remain_block.filter(&filter)?; - for col in remain_block.columns() { - data_block.add_column(col.clone()); - } - } - MutationOperator::Update => { - for col in remain_block.columns() { - data_block.add_column(col.clone()); - } - data_block.add_column(BlockEntry { - data_type: DataType::Boolean, - value: filter, - }); - } - } - data_block - } else { - return Err(ErrorCode::Internal("It's a bug. Need remain reader")); - }; - - self.state = State::PerformOperator(data_block); - } - State::PerformOperator(data_block) => { - let func_ctx = self.ctx.try_get_function_context()?; - let block = self - .operators - .iter() - .try_fold(data_block, |input, op| op.execute(&func_ctx, input))?; - let meta = SerializeDataMeta::create(self.index, self.origin_stats); - self.state = State::Output(self.ctx.try_get_part(), block.add_meta(Some(meta))?); - } - _ => return Err(ErrorCode::Internal("It's a bug.")), - } - Ok(()) - } - - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::ReadData(Some(part)) => { - let settings = ReadSettings::from_ctx(&self.ctx)?; - let part = MutationPartInfo::from_part(&part)?; - self.index = part.index; - self.origin_stats = part.cluster_stats.clone(); - let inner_part = part.inner_part.clone(); - let fuse_part = FusePartInfo::from_part(&inner_part)?; - - let read_res = self - .block_reader - .read_columns_data_by_merge_io( - &settings, - &fuse_part.location, - &fuse_part.columns_meta, - ) - .await?; - let chunks = read_res - .columns_chunks()? 
- .into_iter() - .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) - .collect::>(); - self.state = State::FilterData(inner_part, chunks); - } - State::ReadRemain { - part, - data_block, - filter, - } => { - if let Some(remain_reader) = self.remain_reader.as_ref() { - let fuse_part = FusePartInfo::from_part(&part)?; - - let settings = ReadSettings::from_ctx(&self.ctx)?; - let read_res = remain_reader - .read_columns_data_by_merge_io( - &settings, - &fuse_part.location, - &fuse_part.columns_meta, - ) - .await?; - let chunks = read_res - .columns_chunks()? - .into_iter() - .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec())) - .collect::>(); - - self.state = State::MergeRemain { - part, - chunks, - data_block, - filter, - }; - } else { - return Err(ErrorCode::Internal("It's a bug. No remain reader")); - } - } - _ => return Err(ErrorCode::Internal("It's a bug.")), - } - Ok(()) - } -} diff --git a/src/query/storages/fuse/src/operations/mutation/util.rs b/src/query/storages/fuse/src/operations/mutation/util.rs deleted file mode 100644 index e54ffc800200..000000000000 --- a/src/query/storages/fuse/src/operations/mutation/util.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2023 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
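For context on what this patch folds away: util.rs held only the DataChunks alias (re-created privately inside mutation_source.rs above) and SerializeState (moved into serialize_data_transform.rs). DataChunks pairs a column index with that column's raw bytes for one block; the extraction of this patch has eaten the inner byte type, which the sketch below assumes to be Vec<u8>. The helper mirrors the columns_chunks()/to_vec()/collect() pattern in the sources.

    // Sketch under that assumption; `to_chunks` is a hypothetical helper.
    type DataChunks = Vec<(usize, Vec<u8>)>;

    fn to_chunks<'a>(read_res: impl Iterator<Item = (usize, &'a [u8])>) -> DataChunks {
        read_res
            .map(|(column_idx, column_chunk)| (column_idx, column_chunk.to_vec()))
            .collect()
    }

    fn main() {
        let raw: Vec<(usize, &[u8])> = vec![(0, b"abc".as_slice()), (2, b"xy".as_slice())];
        let chunks = to_chunks(raw.into_iter());
        assert_eq!(chunks[0], (0, b"abc".to_vec()));
        println!("collected {} column chunks", chunks.len());
    }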
- -pub type DataChunks = Vec<(usize, Vec)>; - -pub struct SerializeState { - pub block_data: Vec, - pub block_location: String, - pub index_data: Vec, - pub index_location: String, -} From 09f50c8992031e96795804b088ac87300136d0f3 Mon Sep 17 00:00:00 2001 From: zhyass Date: Wed, 11 Jan 2023 01:51:28 +0800 Subject: [PATCH 20/26] remove unused codes --- .../src/operations/mutation/mutation_sink.rs | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs b/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs index 6e9e4131018e..ca2fb511212a 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_sink.rs @@ -15,8 +15,6 @@ use std::any::Any; use std::sync::Arc; -use common_base::base::Progress; -use common_base::base::ProgressValues; use common_catalog::table::Table; use common_catalog::table::TableExt; use common_catalog::table_context::TableContext; @@ -65,7 +63,6 @@ pub struct MutationSink { ctx: Arc, dal: Operator, location_gen: TableMetaLocationGenerator, - scan_progress: Arc, table: Arc, base_snapshot: Arc, @@ -87,13 +84,11 @@ impl MutationSink { base_snapshot: Arc, input: Arc, ) -> Result { - let scan_progress = ctx.get_scan_progress(); Ok(ProcessorPtr::create(Box::new(MutationSink { state: State::None, ctx, dal: table.get_operator(), location_gen: table.meta_location_generator.clone(), - scan_progress, table: Arc::new(table.clone()), base_snapshot, merged_segments: vec![], @@ -160,22 +155,6 @@ impl Processor for MutationSink { State::ReadMeta(input_meta) => { let meta = MutationSinkMeta::from_meta(&input_meta)?; - let affect_rows = self - .base_snapshot - .summary - .row_count - .abs_diff(meta.summary.row_count); - let affect_bytes = self - .base_snapshot - .summary - .uncompressed_byte_size - .abs_diff(meta.summary.uncompressed_byte_size); - let progress_values = ProgressValues { - rows: affect_rows as usize, - bytes: affect_bytes as usize, - }; - self.scan_progress.incr(&progress_values); - self.merged_segments = meta.segments.clone(); self.merged_statistics = meta.summary.clone(); self.abort_operation = meta.abort_operation.clone(); From 0dcc1497a530cb0573bf728ee6ebf1e0ec88e74c Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jan 2023 01:31:06 +0800 Subject: [PATCH 21/26] update --- .../src/interpreters/interpreter_update.rs | 73 ++++++++-- .../storages/fuse/operations/mutation/mod.rs | 1 - .../fuse/operations/mutation/update.rs | 124 ----------------- src/query/sql/src/planner/binder/delete.rs | 10 -- src/query/sql/src/planner/binder/update.rs | 4 +- src/query/sql/src/planner/mod.rs | 1 + src/query/sql/src/planner/plans/delete.rs | 4 - src/query/sql/src/planner/plans/update.rs | 6 +- src/query/storages/fuse/src/fuse_table.rs | 1 - .../storages/fuse/src/operations/delete.rs | 2 +- .../operations/mutation/mutation_source.rs | 16 ++- .../mutation/serialize_data_transform.rs | 11 +- .../storages/fuse/src/operations/update.rs | 129 ++++++++---------- 13 files changed, 143 insertions(+), 239 deletions(-) delete mode 100644 src/query/service/tests/it/storages/fuse/operations/mutation/update.rs diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index f23ea07a0489..4eb3856563dd 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -14,10 +14,19 @@ use 
std::sync::Arc; -use common_datavalues::DataSchemaRef; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::types::DataType; +use common_expression::DataSchema; +use common_expression::DataSchemaRef; use common_pipeline_core::Pipeline; +use common_sql::plans::BoundColumnRef; +use common_sql::plans::CastExpr; +use common_sql::plans::FunctionCall; +use common_sql::BindContext; +use common_sql::ColumnBinding; +use common_sql::Scalar; +use common_sql::Visibility; use crate::interpreters::Interpreter; use crate::pipelines::executor::ExecutorSettings; @@ -25,7 +34,6 @@ use crate::pipelines::executor::PipelineCompleteExecutor; use crate::pipelines::PipelineBuildResult; use crate::sessions::QueryContext; use crate::sessions::TableContext; -use crate::sql::executor::ExpressionBuilderWithoutRenaming; use crate::sql::plans::ScalarExpr; use crate::sql::plans::UpdatePlan; @@ -62,22 +70,68 @@ impl Interpreter for UpdateInterpreter { let tbl_name = self.plan.table.as_str(); let tbl = self.ctx.get_table(catalog_name, db_name, tbl_name).await?; - let eb = ExpressionBuilderWithoutRenaming::create(self.plan.metadata.clone()); // TODO(zhyass): selection and update_list support subquery. let (filter, col_indices) = if let Some(scalar) = &self.plan.selection { - ( - Some(eb.build(scalar)?), - scalar.used_columns().into_iter().collect(), - ) + let filter = scalar.as_expr()?.as_remote_expr(); + let col_indices = scalar.used_columns().into_iter().collect(); + (Some(filter), col_indices) } else { (None, vec![]) }; + let predicate = Scalar::BoundColumnRef(BoundColumnRef { + column: ColumnBinding { + database_name: None, + table_name: None, + column_name: "_predicate".to_string(), + index: tbl.schema().num_fields(), + data_type: Box::new(DataType::Boolean), + visibility: Visibility::Visible, + }, + }); + + let schema: DataSchema = tbl.schema().into(); let update_list = self.plan.update_list.iter().try_fold( Vec::with_capacity(self.plan.update_list.len()), |mut acc, (id, scalar)| { - let expr = eb.build(scalar)?; - acc.push((*id, expr)); + let field = schema.field(*id); + let left = Scalar::CastExpr(CastExpr { + argument: Box::new(scalar.clone()), + from_type: Box::new(scalar.data_type()), + target_type: Box::new(field.data_type().clone()), + }); + let scalar = if col_indices.is_empty() { + // The condition is always true.
+ // Replace column to the result of the following expression: + // CAST(expression, type) + left + } else { + // Replace column to the result of the following expression: + // if(condition, CAST(expression, type), column) + let mut right = None; + for column_binding in self.plan.bind_context.columns.iter() { + if BindContext::match_column_binding( + Some(db_name), + Some(tbl_name), + field.name(), + column_binding, + ) { + right = Some(Scalar::BoundColumnRef(BoundColumnRef { + column: column_binding.clone(), + })); + break; + } + } + let right = right.ok_or_else(|| ErrorCode::Internal("It's a bug"))?; + let return_type = right.data_type(); + Scalar::FunctionCall(FunctionCall { + params: vec![], + arguments: vec![predicate.clone(), left, right], + func_name: "if".to_string(), + return_type: Box::new(return_type), + }) + }; + acc.push((*id, scalar.as_expr()?.as_remote_expr())); Ok::<_, ErrorCode>(acc) }, )?; diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs index ec24f4309b8b..639ff7ebea08 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/mod.rs @@ -16,6 +16,5 @@ mod block_compact_mutator; mod deletion; mod recluster_mutator; mod segments_compact_mutator; -mod update; pub use deletion::do_deletion; diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/update.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/update.rs deleted file mode 100644 index 5ae69eb698e2..000000000000 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/update.rs +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Datafuse Labs. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
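The interpreter change above is the core of patch 21: the old ExpressionBuilder path is gone, and each SET clause is rewritten against the _predicate boolean column that MutationSource appends to the block. A string-level sketch of the rewrite follows; the real code builds a Scalar tree rather than SQL text, and rewrite_set_clause is an invented helper shown only to make the shape concrete.

    // Hypothetical helper; shows the shape of the rewrite only.
    fn rewrite_set_clause(column: &str, expr: &str, target_type: &str, has_filter: bool) -> String {
        let cast = format!("CAST({expr} AS {target_type})");
        if has_filter {
            // rows where the predicate is false keep their old value
            format!("if(_predicate, {cast}, {column})")
        } else {
            // WHERE clause absent or always true: every row takes the new value
            cast
        }
    }

    fn main() {
        assert_eq!(
            rewrite_set_clause("id", "0", "INT", true),
            "if(_predicate, CAST(0 AS INT), id)"
        );
        assert_eq!(rewrite_set_clause("id", "0", "INT", false), "CAST(0 AS INT)");
        println!("ok");
    }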
- -use std::sync::Arc; - -use common_base::base::tokio; -use common_exception::ErrorCode; -use common_exception::Result; -use common_sql::executor::ExpressionBuilderWithoutRenaming; -use common_sql::plans::Plan; -use common_sql::plans::ScalarExpr; -use common_sql::plans::UpdatePlan; -use common_sql::Planner; -use common_storages_factory::Table; -use common_storages_fuse::FuseTable; -use databend_query::pipelines::executor::ExecutorSettings; -use databend_query::pipelines::executor::PipelineCompleteExecutor; -use databend_query::sessions::QueryContext; -use databend_query::sessions::TableContext; - -use crate::storages::fuse::table_test_fixture::execute_command; -use crate::storages::fuse::table_test_fixture::execute_query; -use crate::storages::fuse::table_test_fixture::expects_ok; -use crate::storages::fuse::table_test_fixture::TestFixture; - -#[tokio::test(flavor = "multi_thread")] -async fn test_update_mutator_multiple_empty_segments() -> Result<()> { - let fixture = TestFixture::new().await; - let ctx = fixture.ctx(); - let tbl_name = fixture.default_table_name(); - let db_name = fixture.default_db_name(); - - fixture.create_normal_table().await?; - - // insert - for i in 0..10 { - let qry = format!("insert into {}.{}(id) values({})", db_name, tbl_name, i); - execute_command(ctx.clone(), qry.as_str()).await?; - } - - let catalog = ctx.get_catalog(fixture.default_catalog_name().as_str())?; - let table = catalog - .get_table(ctx.get_tenant().as_str(), &db_name, &tbl_name) - .await?; - // update - let query = format!("update {}.{} set id=0 where id>0", db_name, tbl_name); - let mut planner = Planner::new(ctx.clone()); - let (plan, _, _) = planner.plan_sql(&query).await?; - if let Plan::Update(update) = plan { - do_update(ctx.clone(), table.clone(), *update).await?; - } - - // check count - let expected = vec![ - "+-------+", - "| count |", - "+-------+", - "| 10 |", - "+-------+", - ]; - let qry = format!( - "select count(1) as count from {}.{} where id=0", - db_name, tbl_name - ); - expects_ok( - "check count", - execute_query(fixture.ctx(), qry.as_str()).await, - expected, - ) - .await?; - Ok(()) -} - -pub async fn do_update( - ctx: Arc, - table: Arc, - plan: UpdatePlan, -) -> Result<()> { - let eb = ExpressionBuilderWithoutRenaming::create(plan.metadata.clone()); - let (filter, col_indices) = if let Some(scalar) = &plan.selection { - ( - Some(eb.build(scalar)?), - scalar.used_columns().into_iter().collect(), - ) - } else { - (None, vec![]) - }; - let update_list = plan.update_list.iter().try_fold( - Vec::with_capacity(plan.update_list.len()), - |mut acc, (id, scalar)| { - let expr = eb.build(scalar)?; - acc.push((*id, expr)); - Ok::<_, ErrorCode>(acc) - }, - )?; - - let fuse_table = FuseTable::try_from_table(table.as_ref())?; - let settings = ctx.get_settings(); - let mut pipeline = common_pipeline_core::Pipeline::create(); - fuse_table - .update(ctx.clone(), filter, col_indices, update_list, &mut pipeline) - .await?; - if !pipeline.pipes.is_empty() { - pipeline.set_max_threads(settings.get_max_threads()? 
as usize); - let query_id = ctx.get_id(); - let executor_settings = ExecutorSettings::try_create(&settings, query_id)?; - let executor = PipelineCompleteExecutor::try_create(pipeline, executor_settings)?; - ctx.set_executor(Arc::downgrade(&executor.get_inner())); - executor.execute()?; - drop(executor); - } - Ok(()) -} diff --git a/src/query/sql/src/planner/binder/delete.rs b/src/query/sql/src/planner/binder/delete.rs index 09fc0bef0dd7..f505f9cb0407 100644 --- a/src/query/sql/src/planner/binder/delete.rs +++ b/src/query/sql/src/planner/binder/delete.rs @@ -65,14 +65,6 @@ impl<'a> Binder { &[], ); - let table = self - .ctx - .get_table(&catalog_name, &database_name, &table_name) - .await?; - - let tbl_info = table.get_table_info(); - let table_id = tbl_info.ident; - let selection = if let Some(expr) = filter { let (scalar, _) = scalar_binder.bind(expr).await?; Some(scalar) @@ -84,8 +76,6 @@ impl<'a> Binder { catalog_name, database_name, table_name, - table_id, - metadata: self.metadata.clone(), selection, }; Ok(Plan::Delete(Box::new(plan))) diff --git a/src/query/sql/src/planner/binder/update.rs b/src/query/sql/src/planner/binder/update.rs index 77dcd39abf3d..da5b56c05530 100644 --- a/src/query/sql/src/planner/binder/update.rs +++ b/src/query/sql/src/planner/binder/update.rs @@ -67,7 +67,6 @@ impl<'a> Binder { .ctx .get_table(&catalog_name, &database_name, &table_name) .await?; - let table_id = table.get_id(); let mut scalar_binder = ScalarBinder::new( &context, @@ -103,10 +102,9 @@ impl<'a> Binder { catalog: catalog_name, database: database_name, table: table_name, - table_id, - metadata: self.metadata.clone(), update_list: update_columns, selection: push_downs, + bind_context: Box::new(context.clone()), }; Ok(Plan::Update(Box::new(plan))) } diff --git a/src/query/sql/src/planner/mod.rs b/src/query/sql/src/planner/mod.rs index cada12f16f57..e417a7dc8db8 100644 --- a/src/query/sql/src/planner/mod.rs +++ b/src/query/sql/src/planner/mod.rs @@ -34,6 +34,7 @@ pub use expression_parser::parse_exprs; pub use expression_parser::parse_to_remote_string_exprs; pub use metadata::*; pub use planner::Planner; +pub use plans::Scalar; pub use plans::ScalarExpr; pub use semantic::normalize_identifier; pub use semantic::validate_function_arg; diff --git a/src/query/sql/src/planner/plans/delete.rs b/src/query/sql/src/planner/plans/delete.rs index 0d1f6090b972..6ad0eccaf909 100644 --- a/src/query/sql/src/planner/plans/delete.rs +++ b/src/query/sql/src/planner/plans/delete.rs @@ -16,18 +16,14 @@ use std::sync::Arc; use common_expression::DataSchema; use common_expression::DataSchemaRef; -use common_meta_app::schema::TableIdent; use crate::plans::Scalar; -use crate::MetadataRef; #[derive(Clone, Debug)] pub struct DeletePlan { pub catalog_name: String, pub database_name: String, pub table_name: String, - pub table_id: TableIdent, - pub metadata: MetadataRef, pub selection: Option, } diff --git a/src/query/sql/src/planner/plans/update.rs b/src/query/sql/src/planner/plans/update.rs index 186afc9a9e81..7ce2469b5568 100644 --- a/src/query/sql/src/planner/plans/update.rs +++ b/src/query/sql/src/planner/plans/update.rs @@ -17,20 +17,18 @@ use std::sync::Arc; use common_expression::DataSchema; use common_expression::DataSchemaRef; -use common_meta_types::MetaId; use crate::plans::Scalar; -use crate::MetadataRef; +use crate::BindContext; #[derive(Clone, Debug)] pub struct UpdatePlan { pub catalog: String, pub database: String, pub table: String, - pub table_id: MetaId, - pub metadata: MetadataRef, pub update_list: 
HashMap, pub selection: Option, + pub bind_context: Box, } impl UpdatePlan { diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index eefec7fc26a7..d1d1f4c4fe75 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -36,7 +36,6 @@ use common_exception::ErrorCode; use common_exception::Result; use common_expression::BlockCompactThresholds; use common_expression::DataBlock; -// use common_sql::ExpressionParser; use common_expression::RemoteExpr; use common_meta_app::schema::DatabaseType; use common_meta_app::schema::TableInfo; diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs index d9e2256d334f..1863685a7a12 100644 --- a/src/query/storages/fuse/src/operations/delete.rs +++ b/src/query/storages/fuse/src/operations/delete.rs @@ -144,7 +144,7 @@ impl FuseTable { Ok(()) } - fn try_eval_const( + pub fn try_eval_const( &self, ctx: Arc, schema: &TableSchema, diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs index 00140ba8efcd..94980c35018e 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -88,6 +88,7 @@ pub struct MutationSource { } impl MutationSource { + #![allow(clippy::too_many_arguments)] pub fn try_create( ctx: Arc, action: MutationAction, @@ -209,8 +210,10 @@ impl Processor for MutationSource { MutationAction::Deletion => { if affect_rows == num_rows { // all the rows should be removed. - let meta = - SerializeDataMeta::create(self.index, self.origin_stats); + let meta = SerializeDataMeta::create( + self.index.clone(), + self.origin_stats.clone(), + ); self.state = State::Output( self.ctx.try_get_part(), DataBlock::empty_with_meta(meta), @@ -222,8 +225,8 @@ impl Processor for MutationSource { data_block = data_block.filter(&filter)?; if self.remain_reader.is_none() { let meta = SerializeDataMeta::create( - self.index, - self.origin_stats, + self.index.clone(), + self.origin_stats.clone(), ); self.state = State::Output( self.ctx.try_get_part(), @@ -294,7 +297,6 @@ impl Processor for MutationSource { }); } } - data_block } else { return Err(ErrorCode::Internal("It's a bug. 
Need remain reader")); }; @@ -307,7 +309,7 @@ impl Processor for MutationSource { .operators .iter() .try_fold(data_block, |input, op| op.execute(&func_ctx, input))?; - let meta = SerializeDataMeta::create(self.index, self.origin_stats); + let meta = SerializeDataMeta::create(self.index.clone(), self.origin_stats.clone()); self.state = State::Output(self.ctx.try_get_part(), block.add_meta(Some(meta))?); } _ => return Err(ErrorCode::Internal("It's a bug.")), @@ -320,7 +322,7 @@ impl Processor for MutationSource { State::ReadData(Some(part)) => { let settings = ReadSettings::from_ctx(&self.ctx)?; let part = MutationPartInfo::from_part(&part)?; - self.index = part.index; + self.index = part.index.clone(); self.origin_stats = part.cluster_stats.clone(); let inner_part = part.inner_part.clone(); let fuse_part = FusePartInfo::from_part(&inner_part)?; diff --git a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs index 88f9552820c9..dd3983f9cd95 100644 --- a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs @@ -144,18 +144,17 @@ impl Processor for SerializeDataTransform { let mut input_data = self.input.pull_data().unwrap()?; let meta = input_data.take_meta(); - if meta.is_none() { - self.state = State::Output(Mutation::DoNothing); - } else { - let meta = meta.unwrap(); + if let Some(meta) = meta { let meta = SerializeDataMeta::from_meta(&meta)?; - self.index = meta.index; + self.index = meta.index.clone(); self.origin_stats = meta.cluster_stats.clone(); if input_data.is_empty() { self.state = State::Output(Mutation::Deleted); } else { self.state = State::NeedSerialize(input_data); } + } else { + self.state = State::Output(Mutation::DoNothing); } Ok(Event::Sync) } @@ -217,7 +216,7 @@ impl Processor for SerializeDataTransform { ); } State::Output(op) => { - let meta = MutationTransformMeta::create(self.index, op); + let meta = MutationTransformMeta::create(self.index.clone(), op); self.output_data = Some(DataBlock::empty_with_meta(meta)); } _ => return Err(ErrorCode::Internal("It's a bug.")), diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index b1eec354d28c..03f0617dd201 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -15,23 +15,28 @@ use std::collections::BTreeMap; use std::sync::Arc; -use common_catalog::plan::Expression; use common_catalog::plan::Partitions; use common_catalog::plan::PartitionsShuffleKind; use common_catalog::plan::Projection; use common_catalog::plan::PushDownInfo; use common_catalog::table::Table; use common_catalog::table_context::TableContext; -use common_datavalues::prelude::*; use common_exception::Result; -use common_sql::evaluator::ChunkOperator; -use common_sql::evaluator::Evaluator; - +use common_expression::RemoteExpr; +use common_expression::TableDataType; +use common_expression::TableField; +use common_expression::TableSchema; +use common_functions::scalars::BUILTIN_FUNCTIONS; +use common_sql::evaluator::BlockOperator; + +use crate::operations::mutation::MutationAction; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSink; -use crate::operations::mutation::UpdateSource; +use crate::operations::mutation::MutationSource; +use crate::operations::mutation::SerializeDataTransform; 
use crate::pipelines::Pipeline; use crate::pruning::BlockPruner; +use crate::statistics::ClusterStatsGenerator; use crate::FuseTable; impl FuseTable { @@ -47,9 +52,9 @@ impl FuseTable { pub async fn do_update( &self, ctx: Arc, - filter: Option, + filter: Option>, col_indices: Vec, - update_list: Vec<(usize, Expression)>, + update_list: Vec<(usize, RemoteExpr)>, pipeline: &mut Pipeline, ) -> Result<()> { let snapshot_opt = self.read_table_snapshot().await?; @@ -69,11 +74,13 @@ impl FuseTable { let all_col_ids = self.all_the_columns_ids(); let schema = self.schema(); - let mut operators = Vec::with_capacity(update_list.len() + 2); + let mut ops = Vec::with_capacity(update_list.len() + 2); let mut offset_map = BTreeMap::new(); let mut remain_reader = None; let (projection, filters) = if col_indices.is_empty() { - if filter.is_some() && !self.try_eval_const(&filter.unwrap())? { + if filter.is_some() + && !self.try_eval_const(ctx.clone(), &self.schema(), &filter.unwrap())? + { // The condition is always false, do nothing. return Ok(()); } @@ -85,20 +92,13 @@ impl FuseTable { acc }); - // The condition is always true. - // Replace column to the result of the following expression: - // CAST(expression, type) - for (id, expr) in update_list.into_iter() { - let field = schema.field(id); - let target = field.data_type(); - let new_expr = Expression::Cast { - input: Box::new(expr), - target: target.clone(), - }; - operators.push(ChunkOperator::Map { - eval: Evaluator::eval_expression(&new_expr, &schema)?, - name: format!("new_{}", field.name()), - }); + for (id, remote_expr) in update_list.into_iter() { + let expr = remote_expr + .as_expr(&BUILTIN_FUNCTIONS) + .unwrap() + .project_column_ref(|name| schema.index_of(name).unwrap()); + + ops.push(BlockOperator::Map { expr }); offset_map.insert(id, pos); pos += 1; } @@ -111,6 +111,10 @@ impl FuseTable { acc }); + let mut fields: Vec = col_indices + .iter() + .map(|idx| schema.fields()[*idx].clone()) + .collect(); let remain_col_ids: Vec = all_col_ids .into_iter() .filter(|id| !col_indices.contains(id)) @@ -122,42 +126,21 @@ impl FuseTable { acc }); - remain_reader = - Some((*self.create_block_reader(Projection::Columns(remain_col_ids))?).clone()); + let reader = self.create_block_reader(Projection::Columns(remain_col_ids))?; + fields.extend_from_slice(reader.schema().fields()); + remain_reader = Some((*reader).clone()); } - let mut fields = schema.fields().clone(); - fields.push(DataField::new("_predicate", bool::to_data_type())); - let input_schema = Arc::new(DataSchema::new(fields)); + fields.push(TableField::new("_predicate", TableDataType::Boolean)); + let input_schema = Arc::new(TableSchema::new(fields)); pos += 1; - // Replace column to the result of the following expression: - // if(condition, CAST(expression, type), column) - for (id, expr) in update_list.into_iter() { - let field = schema.field(id); - let target = field.data_type(); - let new_expr = Expression::Function { - name: "if".to_string(), - args: vec![ - Expression::IndexedVariable { - name: "_predicate".to_string(), - data_type: bool::to_data_type(), - }, - Expression::Cast { - input: Box::new(expr), - target: target.clone(), - }, - Expression::IndexedVariable { - name: field.name().clone(), - data_type: target.clone(), - }, - ], - return_type: target.clone(), - }; - operators.push(ChunkOperator::Map { - eval: Evaluator::eval_expression(&new_expr, &input_schema)?, - name: format!("new_{}", field.name()), - }); + for (id, remote_expr) in update_list.into_iter() { + let expr = 
remote_expr + .as_expr(&BUILTIN_FUNCTIONS) + .unwrap() + .project_column_ref(|name| input_schema.index_of(name).unwrap()); + ops.push(BlockOperator::Map { expr }); offset_map.insert(id, pos); pos += 1; } @@ -167,20 +150,18 @@ impl FuseTable { ]) }; - let offsets = offset_map.values().cloned().collect::>(); - operators.push(ChunkOperator::Project { offsets }); - operators.push(ChunkOperator::Rename { - output_schema: schema, + ops.push(BlockOperator::Project { + projection: offset_map.values().cloned().collect(), }); let block_reader = self.create_block_reader(projection.clone())?; - let eval_node = if filters.is_empty() { + let filter = if filters.is_empty() { Arc::new(None) } else { - Arc::new(Some(Evaluator::eval_expression( - &filters[0], - block_reader.schema().as_ref(), - )?)) + let schema = block_reader.schema(); + Arc::new(filters[0].as_expr(&BUILTIN_FUNCTIONS).map(|expr| { + expr.project_column_ref(|name| schema.column_with_name(name).unwrap().0) + })) }; let remain_reader = Arc::new(remain_reader); @@ -230,19 +211,29 @@ impl FuseTable { // Add source pipe. pipeline.add_source( |output| { - UpdateSource::try_create( + MutationSource::try_create( ctx.clone(), + MutationAction::Update, output, - self, + filter.clone(), block_reader.clone(), - eval_node.clone(), remain_reader.clone(), - operators.clone(), + ops.clone(), ) }, max_threads, )?; + pipeline.add_transform(|input, output| { + SerializeDataTransform::try_create( + ctx.clone(), + input, + output, + self, + ClusterStatsGenerator::default(), + ) + })?; + self.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?; pipeline.add_sink(|input| { MutationSink::try_create(self, ctx.clone(), snapshot.clone(), input) From 17b9c0b8007f94e7e3096f471a2f516ac22f4956 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jan 2023 01:45:19 +0800 Subject: [PATCH 22/26] fix bug --- .../src/operations/mutation/serialize_data_transform.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs index ce04eff34a20..b453d18d32fd 100644 --- a/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs +++ b/src/query/storages/fuse/src/operations/mutation/serialize_data_transform.rs @@ -174,14 +174,14 @@ impl Processor for SerializeDataTransform { let location = self.location_gen.block_bloom_index_location(&block_id); let bloom_index_state = BloomIndexState::try_create( self.ctx.clone(), - self.source_schema.clone(), + self.schema.clone(), &block, location, )?; let column_distinct_count = bloom_index_state .as_ref() .map(|i| i.column_distinct_count.clone()); - let col_stats = gen_columns_statistics(&block, Some(column_distinct_count))?; + let col_stats = gen_columns_statistics(&block, column_distinct_count)?; // serialize data block. 
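// Note on the statistics fix above: `column_distinct_count` is already an
// Option (it is None whenever the block has no bloom-indexable columns), so
// the old `Some(column_distinct_count)` double-wrapped it. The corrected
// shape, spelled out with an indicative type:
//
//     let column_distinct_count /* Option<HashMap<column id, usize>> */ =
//         bloom_index_state.as_ref().map(|i| i.column_distinct_count.clone());
//     let col_stats = gen_columns_statistics(&block, column_distinct_count)?;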
let mut block_data = Vec::with_capacity(100 * 1024 * 1024); @@ -254,7 +254,7 @@ impl Processor for SerializeDataTransform { { write_data(&index_data, &self.dal, &index_location).await?; } - + self.state = State::Output(Mutation::Replaced(block_meta)); } _ => return Err(ErrorCode::Internal("It's a bug.")), From 5696a6c2b62cc896d0c6baeefcc893dd36ca8c37 Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jan 2023 10:30:41 +0800 Subject: [PATCH 23/26] format codes --- .../operations/mutation/mutation_source.rs | 9 +- .../storages/fuse/src/operations/update.rs | 129 ++++++++++-------- 2 files changed, 73 insertions(+), 65 deletions(-) diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs index 94980c35018e..cbbb20f5c9c9 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -224,14 +224,7 @@ impl Processor for MutationSource { Value::Column(Column::Boolean(predicate_col.not())); data_block = data_block.filter(&filter)?; if self.remain_reader.is_none() { - let meta = SerializeDataMeta::create( - self.index.clone(), - self.origin_stats.clone(), - ); - self.state = State::Output( - self.ctx.try_get_part(), - data_block.add_meta(Some(meta))?, - ); + self.state = State::PerformOperator(data_block); } else { self.state = State::ReadRemain { part, diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index 03f0617dd201..54594307154f 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -1,4 +1,4 @@ -// Copyright 2021 Datafuse Labs. +// Copyright 2023 Datafuse Labs. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ use common_expression::TableField; use common_expression::TableSchema; use common_functions::scalars::BUILTIN_FUNCTIONS; use common_sql::evaluator::BlockOperator; +use storages_common_table_meta::meta::TableSnapshot; use crate::operations::mutation::MutationAction; use crate::operations::mutation::MutationPartInfo; @@ -41,14 +42,7 @@ use crate::FuseTable; impl FuseTable { /// UPDATE column = expression WHERE condition - /// The flow of Pipeline is as follows: - /// +-------------+ - /// |UpdateSource1| ------ - /// +-------------+ | +-----------------+ +------------+ - /// | ... | ... | ---> |MutationTransform| ---> |MutationSink| - /// +-------------+ | +-----------------+ +------------+ - /// |UpdateSourceN| ------ - /// +-------------+ + /// The flow of Pipeline is the same as that of deletion. pub async fn do_update( &self, ctx: Arc, @@ -72,24 +66,61 @@ impl FuseTable { return Ok(()); } - let all_col_ids = self.all_the_columns_ids(); - let schema = self.schema(); - let mut ops = Vec::with_capacity(update_list.len() + 2); - let mut offset_map = BTreeMap::new(); - let mut remain_reader = None; - let (projection, filters) = if col_indices.is_empty() { - if filter.is_some() - && !self.try_eval_const(ctx.clone(), &self.schema(), &filter.unwrap())? - { + if col_indices.is_empty() && filter.is_some() { + let filter_expr = filter.clone().unwrap(); + if !self.try_eval_const(ctx.clone(), &self.schema(), &filter_expr)? { // The condition is always false, do nothing. 
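// e.g. `UPDATE t SET a = 1 WHERE 1 = 0` folds to constant false and returns
// without touching the snapshot, while a constant-true predicate (or no WHERE
// clause at all) turns the statement into a full-table update handled below.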
return Ok(()); } + } + + self.try_add_update_source( + ctx.clone(), + filter, + col_indices, + update_list, + &snapshot, + pipeline, + ) + .await?; + pipeline.add_transform(|input, output| { + SerializeDataTransform::try_create( + ctx.clone(), + input, + output, + self, + ClusterStatsGenerator::default(), + ) + })?; + + self.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?; + + pipeline.add_sink(|input| { + MutationSink::try_create(self, ctx.clone(), snapshot.clone(), input) + })?; + Ok(()) + } + + async fn try_add_update_source( + &self, + ctx: Arc, + filter: Option>, + col_indices: Vec, + update_list: Vec<(usize, RemoteExpr)>, + base_snapshot: &TableSnapshot, + pipeline: &mut Pipeline, + ) -> Result<()> { + let all_col_ids = self.all_the_columns_ids(); + let schema = self.schema(); + let mut ops = Vec::with_capacity(update_list.len() + 1); + let mut offset_map = BTreeMap::new(); + let mut remain_reader = None; + let projection = if col_indices.is_empty() { let mut pos = 0; - offset_map = all_col_ids.iter().fold(offset_map, |mut acc, id| { - acc.insert(*id, pos); + all_col_ids.iter().for_each(|&id| { + offset_map.insert(id, pos); pos += 1; - acc }); for (id, remote_expr) in update_list.into_iter() { @@ -102,13 +133,12 @@ impl FuseTable { offset_map.insert(id, pos); pos += 1; } - (Projection::Columns(all_col_ids), vec![]) + Projection::Columns(all_col_ids) } else { let mut pos = 0; - offset_map = col_indices.iter().fold(offset_map, |mut acc, id| { - acc.insert(*id, pos); + col_indices.iter().for_each(|&id| { + offset_map.insert(id, pos); pos += 1; - acc }); let mut fields: Vec = col_indices @@ -120,10 +150,9 @@ impl FuseTable { .filter(|id| !col_indices.contains(id)) .collect(); if !remain_col_ids.is_empty() { - offset_map = remain_col_ids.iter().fold(offset_map, |mut acc, id| { - acc.insert(*id, pos); + remain_col_ids.iter().for_each(|&id| { + offset_map.insert(id, pos); pos += 1; - acc }); let reader = self.create_block_reader(Projection::Columns(remain_col_ids))?; @@ -145,9 +174,7 @@ impl FuseTable { pos += 1; } - (Projection::Columns(col_indices.clone()), vec![ - filter.unwrap(), - ]) + Projection::Columns(col_indices.clone()) }; ops.push(BlockOperator::Project { @@ -155,13 +182,17 @@ impl FuseTable { }); let block_reader = self.create_block_reader(projection.clone())?; - let filter = if filters.is_empty() { - Arc::new(None) - } else { + + let (filter_expr, filters) = if let Some(filter) = filter { let schema = block_reader.schema(); - Arc::new(filters[0].as_expr(&BUILTIN_FUNCTIONS).map(|expr| { - expr.project_column_ref(|name| schema.column_with_name(name).unwrap().0) - })) + ( + Arc::new(filter.as_expr(&BUILTIN_FUNCTIONS).map(|expr| { + expr.project_column_ref(|name| schema.column_with_name(name).unwrap().0) + })), + vec![filter], + ) + } else { + (Arc::new(None), vec![]) }; let remain_reader = Arc::new(remain_reader); @@ -172,7 +203,7 @@ impl FuseTable { ..PushDownInfo::default() }); - let segments_location = snapshot.segments.clone(); + let segments_location = base_snapshot.segments.clone(); let block_metas = BlockPruner::prune( &ctx, self.operator.clone(), @@ -194,7 +225,7 @@ impl FuseTable { self.table_info.schema(), None, metas, - snapshot.summary.block_count as usize, + base_snapshot.summary.block_count as usize, )?; let parts = Partitions::create( @@ -215,29 +246,13 @@ impl FuseTable { ctx.clone(), MutationAction::Update, output, - filter.clone(), + filter_expr.clone(), block_reader.clone(), remain_reader.clone(), ops.clone(), ) }, max_threads, - 
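// one MutationSource runs per thread; each source repeatedly pulls the next
// MutationPartInfo from the query context (ctx.try_get_part()) until the
// partition queue registered by the pruning step is drained.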
)?; - - pipeline.add_transform(|input, output| { - SerializeDataTransform::try_create( - ctx.clone(), - input, - output, - self, - ClusterStatsGenerator::default(), - ) - })?; - - self.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?; - pipeline.add_sink(|input| { - MutationSink::try_create(self, ctx.clone(), snapshot.clone(), input) - })?; - Ok(()) + ) } } From 1b53a03b03a1d931bf22b1a00d83da97a7580a5b Mon Sep 17 00:00:00 2001 From: zhyass Date: Fri, 13 Jan 2023 21:25:28 +0800 Subject: [PATCH 24/26] add mutation block pruning --- .../storages/fuse/src/operations/delete.rs | 94 +++++++++------- .../storages/fuse/src/operations/update.rs | 104 +++++------------- 2 files changed, 81 insertions(+), 117 deletions(-) diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs index 1863685a7a12..94aecc8f15ed 100644 --- a/src/query/storages/fuse/src/operations/delete.rs +++ b/src/query/storages/fuse/src/operations/delete.rs @@ -189,48 +189,15 @@ impl FuseTable { pipeline: &mut Pipeline, ) -> Result<()> { let projection = Projection::Columns(col_indices.clone()); - let push_down = Some(PushDownInfo { - projection: Some(projection.clone()), - filters: vec![filter.clone()], - ..PushDownInfo::default() - }); - - let segments_location = base_snapshot.segments.clone(); - let block_metas = BlockPruner::prune( - &ctx, - self.operator.clone(), - self.table_info.schema(), - &push_down, - segments_location, + self.mutation_block_purning( + ctx.clone(), + vec![filter.clone()], + projection.clone(), + base_snapshot, ) .await?; - let mut index_stats = Vec::with_capacity(block_metas.len()); - let mut metas = Vec::with_capacity(block_metas.len()); - for (index, block_meta) in block_metas.into_iter() { - index_stats.push((index, block_meta.cluster_stats.clone())); - metas.push(block_meta); - } - - let (_, inner_parts) = self.read_partitions_with_metas( - ctx.clone(), - self.table_info.schema(), - None, - metas, - base_snapshot.summary.block_count as usize, - )?; - - let parts = Partitions::create( - PartitionsShuffleKind::Mod, - index_stats - .into_iter() - .zip(inner_parts.partitions.into_iter()) - .map(|((a, b), c)| MutationPartInfo::create(a, b, c)) - .collect(), - ); - ctx.try_set_partitions(parts)?; - - let block_reader = self.create_block_reader(projection.clone())?; + let block_reader = self.create_block_reader(projection)?; let schema = block_reader.schema(); let filter = Arc::new(filter.as_expr(&BUILTIN_FUNCTIONS).map(|expr| { @@ -275,6 +242,55 @@ impl FuseTable { ) } + pub async fn mutation_block_purning( + &self, + ctx: Arc, + filters: Vec>, + projection: Projection, + base_snapshot: &TableSnapshot, + ) -> Result<()> { + let push_down = Some(PushDownInfo { + projection: Some(projection), + filters, + ..PushDownInfo::default() + }); + + let segments_location = base_snapshot.segments.clone(); + let block_metas = BlockPruner::prune( + &ctx, + self.operator.clone(), + self.table_info.schema(), + &push_down, + segments_location, + ) + .await?; + + let mut index_stats = Vec::with_capacity(block_metas.len()); + let mut metas = Vec::with_capacity(block_metas.len()); + for (index, block_meta) in block_metas.into_iter() { + index_stats.push((index, block_meta.cluster_stats.clone())); + metas.push(block_meta); + } + + let (_, inner_parts) = self.read_partitions_with_metas( + ctx.clone(), + self.table_info.schema(), + None, + metas, + base_snapshot.summary.block_count as usize, + )?; + + let parts = Partitions::create( + 
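// Each shuffled partition pairs one pruned block with its mutation context:
// the block's index inside the snapshot, the optional cluster statistics
// kept for regenerating them after the rewrite, and the inner FusePartInfo
// locating the block's column data.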
PartitionsShuffleKind::Mod, + index_stats + .into_iter() + .zip(inner_parts.partitions.into_iter()) + .map(|((a, b), c)| MutationPartInfo::create(a, b, c)) + .collect(), + ); + ctx.try_set_partitions(parts) + } + pub fn try_add_mutation_transform( &self, ctx: Arc, diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index 54594307154f..7a4aee68c6ff 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -15,10 +15,7 @@ use std::collections::BTreeMap; use std::sync::Arc; -use common_catalog::plan::Partitions; -use common_catalog::plan::PartitionsShuffleKind; use common_catalog::plan::Projection; -use common_catalog::plan::PushDownInfo; use common_catalog::table::Table; use common_catalog::table_context::TableContext; use common_exception::Result; @@ -31,12 +28,10 @@ use common_sql::evaluator::BlockOperator; use storages_common_table_meta::meta::TableSnapshot; use crate::operations::mutation::MutationAction; -use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSink; use crate::operations::mutation::MutationSource; use crate::operations::mutation::SerializeDataTransform; use crate::pipelines::Pipeline; -use crate::pruning::BlockPruner; use crate::statistics::ClusterStatsGenerator; use crate::FuseTable; @@ -84,6 +79,7 @@ impl FuseTable { ) .await?; + // TODO(zhyass): support cluster stats generator. pipeline.add_transform(|input, output| { SerializeDataTransform::try_create( ctx.clone(), @@ -113,29 +109,18 @@ impl FuseTable { ) -> Result<()> { let all_col_ids = self.all_the_columns_ids(); let schema = self.schema(); - let mut ops = Vec::with_capacity(update_list.len() + 1); + let mut offset_map = BTreeMap::new(); let mut remain_reader = None; - let projection = if col_indices.is_empty() { - let mut pos = 0; + let mut pos = 0; + let (projection, input_schema) = if col_indices.is_empty() { all_col_ids.iter().for_each(|&id| { offset_map.insert(id, pos); pos += 1; }); - for (id, remote_expr) in update_list.into_iter() { - let expr = remote_expr - .as_expr(&BUILTIN_FUNCTIONS) - .unwrap() - .project_column_ref(|name| schema.index_of(name).unwrap()); - - ops.push(BlockOperator::Map { expr }); - offset_map.insert(id, pos); - pos += 1; - } - Projection::Columns(all_col_ids) + (Projection::Columns(all_col_ids), schema.clone()) } else { - let mut pos = 0; col_indices.iter().for_each(|&id| { offset_map.insert(id, pos); pos += 1; @@ -145,6 +130,7 @@ impl FuseTable { .iter() .map(|idx| schema.fields()[*idx].clone()) .collect(); + let remain_col_ids: Vec = all_col_ids .into_iter() .filter(|id| !col_indices.contains(id)) @@ -161,82 +147,44 @@ impl FuseTable { } fields.push(TableField::new("_predicate", TableDataType::Boolean)); - let input_schema = Arc::new(TableSchema::new(fields)); pos += 1; - for (id, remote_expr) in update_list.into_iter() { - let expr = remote_expr - .as_expr(&BUILTIN_FUNCTIONS) - .unwrap() - .project_column_ref(|name| input_schema.index_of(name).unwrap()); - ops.push(BlockOperator::Map { expr }); - offset_map.insert(id, pos); - pos += 1; - } - - Projection::Columns(col_indices.clone()) + ( + Projection::Columns(col_indices.clone()), + Arc::new(TableSchema::new(fields)), + ) }; + let mut ops = Vec::with_capacity(update_list.len() + 1); + for (id, remote_expr) in update_list.into_iter() { + let expr = remote_expr + .as_expr(&BUILTIN_FUNCTIONS) + .unwrap() + .project_column_ref(|name| input_schema.index_of(name).unwrap()); + 
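// Every Map appends one freshly evaluated column at the end of the block,
// while offset_map records the output position chosen for each column id;
// the Project pushed right after this loop reorders by those positions and
// drops the scratch columns (including `_predicate`), restoring the table's
// schema order.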
ops.push(BlockOperator::Map { expr }); + offset_map.insert(id, pos); + pos += 1; + } ops.push(BlockOperator::Project { projection: offset_map.values().cloned().collect(), }); let block_reader = self.create_block_reader(projection.clone())?; - - let (filter_expr, filters) = if let Some(filter) = filter { + let remain_reader = Arc::new(remain_reader); + let (filter_expr, filters) = if let Some(remote_expr) = filter { let schema = block_reader.schema(); ( - Arc::new(filter.as_expr(&BUILTIN_FUNCTIONS).map(|expr| { + Arc::new(remote_expr.as_expr(&BUILTIN_FUNCTIONS).map(|expr| { expr.project_column_ref(|name| schema.column_with_name(name).unwrap().0) })), - vec![filter], + vec![remote_expr], ) } else { (Arc::new(None), vec![]) }; - let remain_reader = Arc::new(remain_reader); - - let push_down = Some(PushDownInfo { - projection: Some(projection), - filters, - ..PushDownInfo::default() - }); - - let segments_location = base_snapshot.segments.clone(); - let block_metas = BlockPruner::prune( - &ctx, - self.operator.clone(), - self.table_info.schema(), - &push_down, - segments_location, - ) - .await?; - - let mut indices = Vec::with_capacity(block_metas.len()); - let mut metas = Vec::with_capacity(block_metas.len()); - block_metas.into_iter().for_each(|(index, block_meta)| { - indices.push(index); - metas.push(block_meta); - }); - - let (_, inner_parts) = self.read_partitions_with_metas( - ctx.clone(), - self.table_info.schema(), - None, - metas, - base_snapshot.summary.block_count as usize, - )?; - - let parts = Partitions::create( - PartitionsShuffleKind::Mod, - indices - .into_iter() - .zip(inner_parts.partitions.into_iter()) - .map(|(a, b)| MutationPartInfo::create(a, None, b)) - .collect(), - ); - ctx.try_set_partitions(parts)?; + self.mutation_block_purning(ctx.clone(), filters, projection, base_snapshot) + .await?; let max_threads = ctx.get_settings().get_max_threads()? as usize; // Add source pipe. From b4ecec3892b39171eb4d1cfea3d044c861f1b46c Mon Sep 17 00:00:00 2001 From: zhyass Date: Sat, 14 Jan 2023 00:02:47 +0800 Subject: [PATCH 25/26] fix tests --- .../service/src/interpreters/interpreter_update.rs | 1 - src/query/sql/src/planner/binder/update.rs | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index 4eb3856563dd..2ad62581340c 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -70,7 +70,6 @@ impl Interpreter for UpdateInterpreter { let tbl_name = self.plan.table.as_str(); let tbl = self.ctx.get_table(catalog_name, db_name, tbl_name).await?; - // TODO(zhyass): selection and update_list support subquery. let (filter, col_indices) = if let Some(scalar) = &self.plan.selection { let filter = scalar.as_expr()?.as_remote_expr(); let col_indices = scalar.used_columns().into_iter().collect(); diff --git a/src/query/sql/src/planner/binder/update.rs b/src/query/sql/src/planner/binder/update.rs index da5b56c05530..358386c3cd68 100644 --- a/src/query/sql/src/planner/binder/update.rs +++ b/src/query/sql/src/planner/binder/update.rs @@ -23,6 +23,7 @@ use crate::binder::Binder; use crate::binder::ScalarBinder; use crate::normalize_identifier; use crate::plans::Plan; +use crate::plans::Scalar; use crate::plans::UpdatePlan; use crate::BindContext; @@ -87,12 +88,23 @@ impl<'a> Binder { ))); } + // TODO(zhyass): selection and update_list support subquery. 
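// Until that lands, statements such as
//     UPDATE t1 SET a = (SELECT max(a) FROM t2);
//     UPDATE t1 SET a = 2 WHERE a IN (SELECT a FROM t2);
// are rejected here with an explicit error instead of binding silently.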
let (scalar, _) = scalar_binder.bind(&update_expr.expr).await?; + if matches!(scalar, Scalar::SubqueryExpr(_)) { + return Err(ErrorCode::Internal( + "Update does not support subquery temporarily", + )); + } update_columns.insert(index, scalar); } let push_downs = if let Some(expr) = selection { let (scalar, _) = scalar_binder.bind(expr).await?; + if matches!(scalar, Scalar::SubqueryExpr(_)) { + return Err(ErrorCode::Internal( + "Update does not support subquery temporarily", + )); + } Some(scalar) } else { None From fa279866f607fd27a8f9026b2c4e5f88fb4217ed Mon Sep 17 00:00:00 2001 From: zhyass Date: Sun, 15 Jan 2023 23:05:40 +0800 Subject: [PATCH 26/26] Add sql logic tests --- .../src/interpreters/interpreter_update.rs | 1 - .../storages/fuse/src/operations/update.rs | 3 + .../suites/base/03_dml/03_0035_update | 55 +++++++++++++++++-- 3 files changed, 52 insertions(+), 7 deletions(-) diff --git a/src/query/service/src/interpreters/interpreter_update.rs b/src/query/service/src/interpreters/interpreter_update.rs index 2ad62581340c..d9a7603f0da2 100644 --- a/src/query/service/src/interpreters/interpreter_update.rs +++ b/src/query/service/src/interpreters/interpreter_update.rs @@ -122,7 +122,6 @@ impl Interpreter for UpdateInterpreter { } } let right = right.ok_or_else(|| ErrorCode::Internal("It's a bug"))?; - println!("right: {:?}", right); let return_type = right.data_type(); Scalar::FunctionCall(FunctionCall { params: vec![], diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index 7a4aee68c6ff..1cc62a69898e 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -61,12 +61,15 @@ impl FuseTable { return Ok(()); } + let mut filter = filter; if col_indices.is_empty() && filter.is_some() { let filter_expr = filter.clone().unwrap(); if !self.try_eval_const(ctx.clone(), &self.schema(), &filter_expr)? { // The condition is always false, do nothing. return Ok(()); } + // The condition is always true. 
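// Clearing the filter means no `_predicate` column is synthesized and no
// per-row predicate is evaluated: every selected block takes the plain
// rewrite path in MutationSource, exactly as if the statement had no WHERE
// clause.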
+ filter = None; } self.try_add_update_source( diff --git a/tests/sqllogictests/suites/base/03_dml/03_0035_update b/tests/sqllogictests/suites/base/03_dml/03_0035_update index 3fccb9d507d2..ae0e732c46fc 100644 --- a/tests/sqllogictests/suites/base/03_dml/03_0035_update +++ b/tests/sqllogictests/suites/base/03_dml/03_0035_update @@ -8,25 +8,68 @@ statement ok USE db1 statement ok -CREATE TABLE IF NOT EXISTS t(a Int, b Date) +CREATE TABLE IF NOT EXISTS t1(a Int, b Date) statement ok -INSERT INTO t VALUES(1, '2022-12-30') +INSERT INTO t1 VALUES(1, '2022-12-30') statement ok -INSERT INTO t VALUES(2, '2023-01-01') +INSERT INTO t1 VALUES(2, '2023-01-01') statement ok -UPDATE t SET a = 3 WHERE b > '2022-12-31' +UPDATE t1 SET a = 3 WHERE b > '2022-12-31' query IT -SELECT * FROM t ORDER BY b +SELECT * FROM t1 ORDER BY b ---- 1 2022-12-30 3 2023-01-01 statement ok -drop table t all +UPDATE t1 SET a = 2, b = '2022-12-31' WHERE b > '2022-12-31' + +query IT +SELECT * FROM t1 ORDER BY b +---- +1 2022-12-30 +2 2022-12-31 + +statement ok +UPDATE t1 SET a = 3 WHERE false + +query B +select count(*) = 0 from t1 WHERE a = 3 +---- +1 + +statement ok +UPDATE t1 SET a = 3 WHERE true + +query B +select count(*) = 2 from t1 WHERE a = 3 +---- +1 + +statement error 1006 +UPDATE t1 SET a = 3, a = 4 WHERE b > '2022-12-31' + +statement ok +CREATE TABLE IF NOT EXISTS t2(a Int, b Date) + +statement ok +INSERT INTO t2 VALUES(1, '2022-12-30') + +statement ok +INSERT INTO t2 VALUES(2, '2023-01-01') + +statement error 1001 +UPDATE t1 SET a = 2 WHERE a in (SELECT a FROM t2 WHERE b > '2022-12-31') + +statement ok +drop table t1 all + +statement ok +drop table t2 all statement ok DROP DATABASE db1
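Taken together, the series leaves `do_update` assembling the same four-stage
pipeline that deletion uses. The sketch below condenses that final shape for
reference; it is an illustrative restatement of the code above (error
handling, pruning details, and the empty-table and constant-filter early
returns are elided), not an additional patch:

    // MutationSource (one per thread) -> SerializeDataTransform
    //   -> MutationTransform -> MutationSink
    async fn update_pipeline_sketch(
        table: &FuseTable,
        ctx: Arc<dyn TableContext>,
        filter: Option<RemoteExpr<String>>,
        col_indices: Vec<usize>,
        update_list: Vec<(usize, RemoteExpr<String>)>,
        pipeline: &mut Pipeline,
    ) -> Result<()> {
        let snapshot = match table.read_table_snapshot().await? {
            Some(s) => s,
            None => return Ok(()), // nothing committed yet, nothing to update
        };

        // 1. Prune blocks with the filter, shuffle the survivors into
        //    partitions, and register the parallel MutationSources that
        //    read, filter, and rewrite the affected columns of each block.
        table
            .try_add_update_source(
                ctx.clone(),
                filter,
                col_indices,
                update_list,
                &snapshot,
                pipeline,
            )
            .await?;

        // 2. Re-serialize every rewritten block: data file, bloom index,
        //    and refreshed column statistics.
        pipeline.add_transform(|input, output| {
            SerializeDataTransform::try_create(
                ctx.clone(),
                input,
                output,
                table,
                ClusterStatsGenerator::default(),
            )
        })?;

        // 3. Fold the per-block mutation results back into segments, and
        // 4. commit the new snapshot through the sink.
        table.try_add_mutation_transform(ctx.clone(), snapshot.segments.clone(), pipeline)?;
        pipeline.add_sink(|input| MutationSink::try_create(table, ctx.clone(), snapshot.clone(), input))
    }

After this series, an `UPDATE t1 SET a = 3 WHERE b > '2022-12-31'` runs through
exactly this pipeline, and the new tests in 03_0035_update exercise the
constant-true, constant-false, duplicate-assignment, and subquery-rejection
paths.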