From cd97a399041b316eb55fbaa7dd59802d71f7bc90 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Mon, 3 Jun 2024 23:32:11 +0900 Subject: [PATCH 01/16] chore: enable `strip` for tests-fuzz crate (#4093) --- Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index e0fed99308a6..9623dfb90116 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -255,3 +255,7 @@ strip = true [profile.dev.package.sqlness-runner] debug = false strip = true + +[profile.dev.package.tests-fuzz] +debug = false +strip = true From d1838fb28de054c5f4c82ea07683f28acd4871db Mon Sep 17 00:00:00 2001 From: shuiyisong <113876041+shuiyisong@users.noreply.github.com> Date: Tue, 4 Jun 2024 11:29:15 +0800 Subject: [PATCH 02/16] refactor: move `define_into_tonic_status` to `common-error` (#4095) * chore: finish cherry-pick * chore: remove unused code --- Cargo.lock | 2 +- src/common/error/Cargo.toml | 1 + src/common/error/src/status_code.rs | 70 ++++++++++++++++++++++++++++ src/datanode/src/error.rs | 2 +- src/flow/src/adapter/error.rs | 2 +- src/frontend/src/error.rs | 2 +- src/meta-srv/src/error.rs | 2 +- src/operator/Cargo.toml | 1 - src/operator/src/error.rs | 2 +- src/servers/src/error.rs | 71 +---------------------------- 10 files changed, 78 insertions(+), 77 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35fabee98b6c..18fa00347ff4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1794,6 +1794,7 @@ version = "0.8.1" dependencies = [ "snafu 0.8.3", "strum 0.25.0", + "tonic 0.11.0", ] [[package]] @@ -6713,7 +6714,6 @@ dependencies = [ "query", "regex", "serde_json", - "servers", "session", "snafu 0.8.3", "sql", diff --git a/src/common/error/Cargo.toml b/src/common/error/Cargo.toml index 92ab12dd07f5..49eafb81d5a2 100644 --- a/src/common/error/Cargo.toml +++ b/src/common/error/Cargo.toml @@ -10,3 +10,4 @@ workspace = true [dependencies] snafu.workspace = true strum.workspace = true +tonic.workspace = true diff --git a/src/common/error/src/status_code.rs b/src/common/error/src/status_code.rs index a9d61eed5688..fd519cc1e6e6 100644 --- a/src/common/error/src/status_code.rs +++ b/src/common/error/src/status_code.rs @@ -15,6 +15,7 @@ use std::fmt; use strum::{AsRefStr, EnumIter, EnumString, FromRepr}; +use tonic::Code; /// Common status code for public API. #[derive(Debug, Clone, Copy, PartialEq, Eq, EnumString, AsRefStr, EnumIter, FromRepr)] @@ -202,6 +203,75 @@ impl fmt::Display for StatusCode { } } +#[macro_export] +macro_rules! define_into_tonic_status { + ($Error: ty) => { + impl From<$Error> for tonic::Status { + fn from(err: $Error) -> Self { + use tonic::codegen::http::{HeaderMap, HeaderValue}; + use tonic::metadata::MetadataMap; + use $crate::GREPTIME_DB_HEADER_ERROR_CODE; + + let mut headers = HeaderMap::::with_capacity(2); + + // If either of the status_code or error msg cannot convert to valid HTTP header value + // (which is a very rare case), just ignore. Client will use Tonic status code and message. + let status_code = err.status_code(); + headers.insert( + GREPTIME_DB_HEADER_ERROR_CODE, + HeaderValue::from(status_code as u32), + ); + let root_error = err.output_msg(); + + let metadata = MetadataMap::from_headers(headers); + tonic::Status::with_metadata( + $crate::status_code::status_to_tonic_code(status_code), + root_error, + metadata, + ) + } + } + }; +} + +/// Returns the tonic [Code] of a [StatusCode]. 
+pub fn status_to_tonic_code(status_code: StatusCode) -> Code { + match status_code { + StatusCode::Success => Code::Ok, + StatusCode::Unknown => Code::Unknown, + StatusCode::Unsupported => Code::Unimplemented, + StatusCode::Unexpected + | StatusCode::Internal + | StatusCode::PlanQuery + | StatusCode::EngineExecuteQuery => Code::Internal, + StatusCode::InvalidArguments | StatusCode::InvalidSyntax | StatusCode::RequestOutdated => { + Code::InvalidArgument + } + StatusCode::Cancelled => Code::Cancelled, + StatusCode::TableAlreadyExists + | StatusCode::TableColumnExists + | StatusCode::RegionAlreadyExists + | StatusCode::FlowAlreadyExists => Code::AlreadyExists, + StatusCode::TableNotFound + | StatusCode::RegionNotFound + | StatusCode::TableColumnNotFound + | StatusCode::DatabaseNotFound + | StatusCode::UserNotFound + | StatusCode::FlowNotFound => Code::NotFound, + StatusCode::StorageUnavailable | StatusCode::RegionNotReady => Code::Unavailable, + StatusCode::RuntimeResourcesExhausted + | StatusCode::RateLimited + | StatusCode::RegionBusy => Code::ResourceExhausted, + StatusCode::UnsupportedPasswordType + | StatusCode::UserPasswordMismatch + | StatusCode::AuthHeaderNotFound + | StatusCode::InvalidAuthHeader => Code::Unauthenticated, + StatusCode::AccessDenied | StatusCode::PermissionDenied | StatusCode::RegionReadonly => { + Code::PermissionDenied + } + } +} + #[cfg(test)] mod tests { use strum::IntoEnumIterator; diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index 919a921ec349..f1a37f624997 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -15,10 +15,10 @@ use std::any::Any; use std::sync::Arc; +use common_error::define_into_tonic_status; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; -use servers::define_into_tonic_status; use snafu::{Location, Snafu}; use store_api::storage::RegionId; use table::error::Error as TableError; diff --git a/src/flow/src/adapter/error.rs b/src/flow/src/adapter/error.rs index 47df3d9014aa..9d5692aa1ab4 100644 --- a/src/flow/src/adapter/error.rs +++ b/src/flow/src/adapter/error.rs @@ -16,12 +16,12 @@ use std::any::Any; +use common_error::define_into_tonic_status; use common_error::ext::BoxedError; use common_macro::stack_trace_debug; use common_telemetry::common_error::ext::ErrorExt; use common_telemetry::common_error::status_code::StatusCode; use datatypes::value::Value; -use servers::define_into_tonic_status; use snafu::{Location, Snafu}; use crate::adapter::FlowId; diff --git a/src/frontend/src/error.rs b/src/frontend/src/error.rs index 9b2a0faf6320..e7b7a19885d7 100644 --- a/src/frontend/src/error.rs +++ b/src/frontend/src/error.rs @@ -15,10 +15,10 @@ use std::any::Any; use common_datasource::file_format::Format; +use common_error::define_into_tonic_status; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; -use servers::define_into_tonic_status; use snafu::{Location, Snafu}; use store_api::storage::RegionNumber; diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index e598a956d8dc..4490d840449b 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use common_error::define_into_tonic_status; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; @@ -19,7 +20,6 @@ use common_meta::peer::Peer; use common_meta::DatanodeId; use common_runtime::JoinError; use rand::distributions::WeightedError; -use servers::define_into_tonic_status; use snafu::{Location, Snafu}; use store_api::storage::RegionId; use table::metadata::TableId; diff --git a/src/operator/Cargo.toml b/src/operator/Cargo.toml index 8681dca2978e..dda47abebe89 100644 --- a/src/operator/Cargo.toml +++ b/src/operator/Cargo.toml @@ -47,7 +47,6 @@ prometheus.workspace = true query.workspace = true regex.workspace = true serde_json.workspace = true -servers.workspace = true session.workspace = true snafu.workspace = true sql.workspace = true diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index d7dcdb9d7057..f0266547951b 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -15,12 +15,12 @@ use std::any::Any; use common_datasource::file_format::Format; +use common_error::define_into_tonic_status; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use datafusion::parquet; use datatypes::arrow::error::ArrowError; -use servers::define_into_tonic_status; use snafu::{Location, Snafu}; #[derive(Snafu)] diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 7515b767e235..ae595b8e95b6 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -21,6 +21,7 @@ use axum::response::{IntoResponse, Response}; use axum::{http, Json}; use base64::DecodeError; use catalog; +use common_error::define_into_tonic_status; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; @@ -29,7 +30,6 @@ use datatypes::prelude::ConcreteDataType; use query::parser::PromQuery; use serde_json::json; use snafu::{Location, Snafu}; -use tonic::Code; #[derive(Snafu)] #[snafu(visibility(pub))] @@ -695,75 +695,6 @@ impl ErrorExt for Error { } } -/// Returns the tonic [Code] of a [StatusCode]. 
-pub fn status_to_tonic_code(status_code: StatusCode) -> Code { - match status_code { - StatusCode::Success => Code::Ok, - StatusCode::Unknown => Code::Unknown, - StatusCode::Unsupported => Code::Unimplemented, - StatusCode::Unexpected - | StatusCode::Internal - | StatusCode::PlanQuery - | StatusCode::EngineExecuteQuery => Code::Internal, - StatusCode::InvalidArguments | StatusCode::InvalidSyntax | StatusCode::RequestOutdated => { - Code::InvalidArgument - } - StatusCode::Cancelled => Code::Cancelled, - StatusCode::TableAlreadyExists - | StatusCode::TableColumnExists - | StatusCode::RegionAlreadyExists - | StatusCode::FlowAlreadyExists => Code::AlreadyExists, - StatusCode::TableNotFound - | StatusCode::RegionNotFound - | StatusCode::TableColumnNotFound - | StatusCode::DatabaseNotFound - | StatusCode::UserNotFound - | StatusCode::FlowNotFound => Code::NotFound, - StatusCode::StorageUnavailable | StatusCode::RegionNotReady => Code::Unavailable, - StatusCode::RuntimeResourcesExhausted - | StatusCode::RateLimited - | StatusCode::RegionBusy => Code::ResourceExhausted, - StatusCode::UnsupportedPasswordType - | StatusCode::UserPasswordMismatch - | StatusCode::AuthHeaderNotFound - | StatusCode::InvalidAuthHeader => Code::Unauthenticated, - StatusCode::AccessDenied | StatusCode::PermissionDenied | StatusCode::RegionReadonly => { - Code::PermissionDenied - } - } -} - -#[macro_export] -macro_rules! define_into_tonic_status { - ($Error: ty) => { - impl From<$Error> for tonic::Status { - fn from(err: $Error) -> Self { - use tonic::codegen::http::{HeaderMap, HeaderValue}; - use tonic::metadata::MetadataMap; - use $crate::http::header::constants::GREPTIME_DB_HEADER_ERROR_CODE; - - let mut headers = HeaderMap::::with_capacity(2); - - // If either of the status_code or error msg cannot convert to valid HTTP header value - // (which is a very rare case), just ignore. Client will use Tonic status code and message. 
- let status_code = err.status_code(); - headers.insert( - GREPTIME_DB_HEADER_ERROR_CODE, - HeaderValue::from(status_code as u32), - ); - let root_error = err.output_msg(); - - let metadata = MetadataMap::from_headers(headers); - tonic::Status::with_metadata( - $crate::error::status_to_tonic_code(status_code), - root_error, - metadata, - ) - } - } - }; -} - define_into_tonic_status!(Error); impl From for Error { From 6a0998dfecbfbf7cbe345b728e42a8f17214c665 Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Mon, 3 Jun 2024 17:18:56 +0800 Subject: [PATCH 03/16] refactor: remove servers dep on pipeline --- Cargo.lock | 1 - src/frontend/src/instance.rs | 2 - src/frontend/src/instance/builder.rs | 6 ++- src/frontend/src/pipeline.rs | 81 +++++++++------------------- src/operator/src/statement.rs | 2 + src/pipeline/Cargo.toml | 3 +- src/pipeline/src/table.rs | 29 ++++++---- 7 files changed, 51 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9d53fa4ea3a7..f4256a6c40cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7296,7 +7296,6 @@ dependencies = [ "rayon", "ron", "serde", - "servers", "session", "snafu 0.8.3", "sql", diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index ed6f52cb6387..47eabbe551fe 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -111,7 +111,6 @@ pub trait FrontendInstance: } pub type FrontendInstanceRef = Arc; -pub type StatementExecutorRef = Arc; #[derive(Clone)] pub struct Instance { @@ -267,7 +266,6 @@ impl FrontendInstance for Instance { } self.script_executor.start(self)?; - self.pipeline_operator.start(self); if let Some(t) = self.export_metrics_task.as_ref() { if t.send_by_handler { diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs index 90634daf1523..2ad668ef1f19 100644 --- a/src/frontend/src/instance/builder.rs +++ b/src/frontend/src/instance/builder.rs @@ -27,7 +27,7 @@ use operator::delete::Deleter; use operator::insert::Inserter; use operator::procedure::ProcedureServiceOperator; use operator::request::Requester; -use operator::statement::StatementExecutor; +use operator::statement::{StatementExecutor, StatementExecutorRef}; use operator::table::TableMutationOperator; use partition::manager::PartitionRuleManager; use query::QueryEngineFactory; @@ -37,7 +37,7 @@ use snafu::OptionExt; use crate::error::{self, Result}; use crate::heartbeat::HeartbeatTask; use crate::instance::region_query::FrontendRegionQueryHandler; -use crate::instance::{Instance, StatementExecutorRef}; +use crate::instance::Instance; use crate::pipeline::PipelineOperator; use crate::script::ScriptExecutor; @@ -174,6 +174,8 @@ impl FrontendBuilder { )); let pipeline_operator = Arc::new(PipelineOperator::new( + inserter.clone(), + statement_executor.clone(), self.catalog_manager.clone(), query_engine.clone(), )); diff --git a/src/frontend/src/pipeline.rs b/src/frontend/src/pipeline.rs index 75f4e2c1c703..3dd107f7dcac 100644 --- a/src/frontend/src/pipeline.rs +++ b/src/frontend/src/pipeline.rs @@ -15,65 +15,38 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use api::v1::ddl_request::Expr; -use api::v1::greptime_request::Request; -use api::v1::{CreateTableExpr, DdlRequest}; -use arc_swap::ArcSwap; +use api::v1::CreateTableExpr; use catalog::{CatalogManagerRef, RegisterSystemTableRequest}; use common_catalog::consts::{default_engine, DEFAULT_PRIVATE_SCHEMA_NAME}; use common_catalog::format_full_table_name; use common_error::ext::{BoxedError, ErrorExt}; -use 
common_query::Output; use common_telemetry::{error, info}; +use operator::insert::InserterRef; +use operator::statement::StatementExecutorRef; use pipeline::table::{PipelineTable, PipelineTableRef}; use pipeline::{GreptimeTransformer, Pipeline}; use query::QueryEngineRef; -use servers::query_handler::grpc::GrpcQueryHandler; use session::context::{QueryContext, QueryContextRef}; use snafu::{OptionExt, ResultExt}; use table::TableRef; use crate::error::{ - CatalogSnafu, Error, GetPipelineSnafu, InsertPipelineSnafu, Result, TableNotFoundSnafu, + CatalogSnafu, GetPipelineSnafu, InsertPipelineSnafu, Result, TableNotFoundSnafu, }; -use crate::instance::Instance; - -type FrontendGrpcQueryHandlerRef = Arc + Send + Sync>; pub const PIPELINE_TABLE_NAME: &str = "pipelines"; -struct DummyHandler; - -impl DummyHandler { - pub fn arc() -> Arc { - Arc::new(Self {}) - } -} - -#[async_trait::async_trait] -impl GrpcQueryHandler for DummyHandler { - type Error = Error; - - async fn do_query( - &self, - _query: Request, - _ctx: QueryContextRef, - ) -> std::result::Result { - unreachable!(); - } -} - pub struct PipelineOperator { - grpc_handler: ArcSwap, + inserter: InserterRef, + statement_executor: StatementExecutorRef, catalog_manager: CatalogManagerRef, query_engine: QueryEngineRef, - tables: RwLock>>, + tables: RwLock>, } impl PipelineOperator { pub fn create_table_request(&self, catalog: &str) -> RegisterSystemTableRequest { - let (time_index, primary_keys, column_defs) = - PipelineTable::::build_pipeline_schema(); + let (time_index, primary_keys, column_defs) = PipelineTable::build_pipeline_schema(); let create_table_expr = CreateTableExpr { catalog_name: catalog.to_string(), @@ -103,8 +76,9 @@ impl PipelineOperator { tables.insert( catalog.to_string(), Arc::new(PipelineTable::new( + self.inserter.clone(), + self.statement_executor.clone(), table, - self.grpc_handler.load().as_ref().clone(), self.query_engine.clone(), )), ); @@ -116,7 +90,7 @@ impl PipelineOperator { } let RegisterSystemTableRequest { - create_table_expr: expr, + create_table_expr: mut expr, open_hook, } = self.create_table_request(catalog); @@ -138,14 +112,11 @@ impl PipelineOperator { let schema = expr.schema_name.clone(); let table_name = expr.table_name.clone(); - let _ = self - .grpc_handler - .load() - .do_query( - Request::Ddl(DdlRequest { - expr: Some(Expr::CreateTable(expr)), - }), - QueryContext::arc(), + self.statement_executor + .create_table_inner( + &mut expr, + None, + Arc::new(QueryContext::with(catalog, &schema)), ) .await?; @@ -172,7 +143,7 @@ impl PipelineOperator { Ok(()) } - pub fn get_pipeline_table_from_cache(&self, catalog: &str) -> Option> { + pub fn get_pipeline_table_from_cache(&self, catalog: &str) -> Option { // FIXME (qtang): we should impl this self.tables.read().unwrap().get(catalog).cloned() } @@ -185,7 +156,7 @@ impl PipelineOperator { content_type: &str, pipeline: &str, ) -> Result<()> { - let _compiled_pipeline = PipelineTable::::compile_pipeline(pipeline) + let _compiled_pipeline = PipelineTable::compile_pipeline(pipeline) .map_err(BoxedError::new) .context(InsertPipelineSnafu { name })?; self.get_pipeline_table_from_cache(catalog) @@ -205,21 +176,21 @@ impl PipelineOperator { } impl PipelineOperator { - pub fn new(catalog_manager: CatalogManagerRef, query_engine: QueryEngineRef) -> Self { - let grpc_handler = ArcSwap::new(Arc::new(DummyHandler::arc() as _)); + pub fn new( + inserter: InserterRef, + statement_executor: StatementExecutorRef, + catalog_manager: CatalogManagerRef, + query_engine: 
QueryEngineRef, + ) -> Self { Self { - grpc_handler, + inserter, + statement_executor, catalog_manager, tables: RwLock::new(HashMap::new()), query_engine, } } - pub fn start(&self, instance: &Instance) { - self.grpc_handler - .store(Arc::new(Arc::new(instance.clone()) as _)); - } - pub async fn get_pipeline( &self, query_ctx: QueryContextRef, diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index 649af286a4bb..a7d170d75fd6 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -73,6 +73,8 @@ pub struct StatementExecutor { inserter: InserterRef, } +pub type StatementExecutorRef = Arc; + impl StatementExecutor { pub fn new( catalog_manager: CatalogManagerRef, diff --git a/src/pipeline/Cargo.toml b/src/pipeline/Cargo.toml index af282eb139d3..538428711a2c 100644 --- a/src/pipeline/Cargo.toml +++ b/src/pipeline/Cargo.toml @@ -36,11 +36,11 @@ datatypes.workspace = true futures.workspace = true lazy_static.workspace = true once_cell.workspace = true +operator.workspace = true paste = { workspace = true } pipeline = { git = "ssh://git@github.com/GreptimeTeam/pipeline.git", rev = "3eb890c551b8d7f60c4491fcfec18966e2b210a4" } prometheus.workspace = true query.workspace = true -servers.workspace = true session.workspace = true snafu.workspace = true sql.workspace = true @@ -52,7 +52,6 @@ catalog = { workspace = true, features = ["testing"] } common-test-util.workspace = true criterion = { version = "0.4", features = ["html_reports", "async_tokio"] } log-store.workspace = true -operator.workspace = true rayon = "1.0" ron = "0.7" serde = { version = "1.0", features = ["derive"] } diff --git a/src/pipeline/src/table.rs b/src/pipeline/src/table.rs index b6d0a31d5a39..681301c13650 100644 --- a/src/pipeline/src/table.rs +++ b/src/pipeline/src/table.rs @@ -1,13 +1,12 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use api::v1::greptime_request::Request; use api::v1::value::ValueData; use api::v1::{ ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows, SemanticType, }; -use common_error::ext::{BoxedError, ErrorExt, PlainError}; +use common_error::ext::{BoxedError, PlainError}; use common_error::status_code::StatusCode; use common_query::OutputData; use common_recordbatch::util as record_util; @@ -19,11 +18,12 @@ use datafusion_common::TableReference; use datafusion_expr::LogicalPlanBuilder; use datatypes::prelude::ScalarVector; use datatypes::vectors::{StringVector, Vector}; +use operator::insert::InserterRef; +use operator::statement::StatementExecutorRef; use pipeline::transform::GreptimeTransformer; use pipeline::{parse, Content, Pipeline}; use query::plan::LogicalPlan; use query::QueryEngineRef; -use servers::query_handler::grpc::GrpcQueryHandlerRef; use session::context::{QueryContextBuilder, QueryContextRef}; use snafu::{ensure, OptionExt, ResultExt}; use table::metadata::TableInfo; @@ -35,24 +35,27 @@ use crate::error::{ InsertPipelineSnafu, ParsePipelineSnafu, PipelineNotFoundSnafu, Result, }; -pub type PipelineTableRef = Arc>; +pub type PipelineTableRef = Arc; pub const PIPELINE_TABLE_NAME: &str = "pipelines"; -pub struct PipelineTable { - grpc_handler: GrpcQueryHandlerRef, +pub struct PipelineTable { + inserter: InserterRef, + statement_executor: StatementExecutorRef, table: TableRef, query_engine: QueryEngineRef, pipelines: RwLock>>, } -impl PipelineTable { +impl PipelineTable { pub fn new( + inserter: InserterRef, + statement_executor: StatementExecutorRef, table: TableRef, - 
grpc_handler: GrpcQueryHandlerRef, query_engine: QueryEngineRef, ) -> Self { Self { - grpc_handler, + inserter, + statement_executor, table, query_engine, pipelines: RwLock::new(HashMap::default()), @@ -238,8 +241,12 @@ impl PipelineTable { }; let output = self - .grpc_handler - .do_query(Request::RowInserts(requests), Self::query_ctx(&table_info)) + .inserter + .handle_row_inserts( + requests, + Self::query_ctx(&table_info), + &self.statement_executor, + ) .await .map_err(BoxedError::new) .context(InsertPipelineSnafu { name })?; From 443eaf9c831bf7c4a82c3f2c3e5258defb042f39 Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Mon, 3 Jun 2024 17:54:09 +0800 Subject: [PATCH 04/16] refactor: move define_into_tonic_status to common-error --- Cargo.lock | 2 +- src/pipeline/src/lib.rs | 1 + src/servers/Cargo.toml | 2 +- src/servers/src/http/handler.rs | 2 +- src/servers/src/query_handler.rs | 3 +-- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4256a6c40cd..09ed63c2c989 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9703,7 +9703,7 @@ dependencies = [ "permutation", "pgwire", "pin-project", - "pipeline 0.1.0", + "pipeline 0.8.1", "postgres-types", "pprof", "prometheus", diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index e102ce489a3e..f5acf8d86af1 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -2,4 +2,5 @@ pub mod error; pub mod table; pub use pipeline::transform::GreptimeTransformer; +pub use pipeline::value::Value; pub use pipeline::Pipeline; diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index a686ea29ebea..31321a84010c 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -69,7 +69,7 @@ opentelemetry-proto.workspace = true parking_lot = "0.12" pgwire = "0.20" pin-project = "1.0" -pipeline = { git = "ssh://git@github.com/GreptimeTeam/pipeline.git", rev = "3eb890c551b8d7f60c4491fcfec18966e2b210a4" } +pipeline.workspace = true postgres-types = { version = "0.2", features = ["with-chrono-0_4"] } pprof = { version = "0.13", features = [ "flamegraph", diff --git a/src/servers/src/http/handler.rs b/src/servers/src/http/handler.rs index 09c19c42260a..302c2186eb89 100644 --- a/src/servers/src/http/handler.rs +++ b/src/servers/src/http/handler.rs @@ -28,7 +28,7 @@ use common_plugins::GREPTIME_EXEC_WRITE_COST; use common_query::{Output, OutputData}; use common_recordbatch::util; use common_telemetry::tracing; -use pipeline::value::Value as PipelineValue; +use pipeline::Value as PipelineValue; use query::parser::{PromQuery, DEFAULT_LOOKBACK_STRING}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index d7facac82e82..c2ef268d494f 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -35,8 +35,7 @@ use common_query::Output; use headers::HeaderValue; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::transform::GreptimeTransformer; -use pipeline::Pipeline; +use pipeline::{GreptimeTransformer, Pipeline}; use serde_json::Value; use session::context::QueryContextRef; From c8ce4ee5bfdb7cce6b7bf3736360d5aad474595c Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Tue, 4 Jun 2024 15:34:21 +0800 Subject: [PATCH 05/16] refactor: bring in pipeline 3eb890c551b8d7f60c4491fcfec18966e2b210a4 --- Cargo.lock | 38 +- src/auth/src/permission.rs | 2 +- 
src/frontend/src/lib.rs | 2 +- src/pipeline/Cargo.toml | 28 +- src/pipeline/src/etl/field.rs | 195 ++++++++ src/pipeline/src/etl/mod.rs | 195 ++++++++ src/pipeline/src/etl/processor/cmcd.rs | 361 ++++++++++++++ src/pipeline/src/etl/processor/csv.rs | 327 +++++++++++++ src/pipeline/src/etl/processor/date.rs | 345 +++++++++++++ src/pipeline/src/etl/processor/epoch.rs | 205 ++++++++ src/pipeline/src/etl/processor/letter.rs | 188 +++++++ src/pipeline/src/etl/processor/mod.rs | 198 ++++++++ src/pipeline/src/etl/processor/regex.rs | 315 ++++++++++++ src/pipeline/src/etl/processor/urlencoding.rs | 177 +++++++ src/pipeline/src/etl/transform/index.rs | 57 +++ src/pipeline/src/etl/transform/mod.rs | 205 ++++++++ .../transform/transformer/greptime/coerce.rs | 310 ++++++++++++ .../etl/transform/transformer/greptime/mod.rs | 172 +++++++ .../src/etl/transform/transformer/mod.rs | 16 + .../src/etl/transform/transformer/noop.rs | 36 ++ src/pipeline/src/etl/value/array.rs | 56 +++ src/pipeline/src/etl/value/map.rs | 58 +++ src/pipeline/src/etl/value/mod.rs | 303 ++++++++++++ src/pipeline/src/etl/value/time.rs | 187 +++++++ src/pipeline/src/lib.rs | 11 +- src/pipeline/src/{ => mng}/error.rs | 0 src/pipeline/src/mng/mod.rs | 2 + src/pipeline/src/{ => mng}/table.rs | 4 +- src/pipeline/tests/pipeline.rs | 461 ++++++++++++++++++ src/script/src/table.rs | 2 +- src/servers/src/http/handler.rs | 4 +- 31 files changed, 4407 insertions(+), 53 deletions(-) create mode 100644 src/pipeline/src/etl/field.rs create mode 100644 src/pipeline/src/etl/mod.rs create mode 100644 src/pipeline/src/etl/processor/cmcd.rs create mode 100644 src/pipeline/src/etl/processor/csv.rs create mode 100644 src/pipeline/src/etl/processor/date.rs create mode 100644 src/pipeline/src/etl/processor/epoch.rs create mode 100644 src/pipeline/src/etl/processor/letter.rs create mode 100644 src/pipeline/src/etl/processor/mod.rs create mode 100644 src/pipeline/src/etl/processor/regex.rs create mode 100644 src/pipeline/src/etl/processor/urlencoding.rs create mode 100644 src/pipeline/src/etl/transform/index.rs create mode 100644 src/pipeline/src/etl/transform/mod.rs create mode 100644 src/pipeline/src/etl/transform/transformer/greptime/coerce.rs create mode 100644 src/pipeline/src/etl/transform/transformer/greptime/mod.rs create mode 100644 src/pipeline/src/etl/transform/transformer/mod.rs create mode 100644 src/pipeline/src/etl/transform/transformer/noop.rs create mode 100644 src/pipeline/src/etl/value/array.rs create mode 100644 src/pipeline/src/etl/value/map.rs create mode 100644 src/pipeline/src/etl/value/mod.rs create mode 100644 src/pipeline/src/etl/value/time.rs rename src/pipeline/src/{ => mng}/error.rs (100%) create mode 100644 src/pipeline/src/mng/mod.rs rename src/pipeline/src/{ => mng}/table.rs (99%) create mode 100644 src/pipeline/tests/pipeline.rs diff --git a/Cargo.lock b/Cargo.lock index 09ed63c2c989..6961827e690f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3933,7 +3933,7 @@ dependencies = [ "opentelemetry-proto 0.5.0", "operator", "partition", - "pipeline 0.8.1", + "pipeline", "prometheus", "prost 0.12.6", "query", @@ -7237,33 +7237,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pipeline" -version = "0.1.0" -source = "git+ssh://git@github.com/GreptimeTeam/pipeline.git?rev=3eb890c551b8d7f60c4491fcfec18966e2b210a4#3eb890c551b8d7f60c4491fcfec18966e2b210a4" -dependencies = [ - "chrono", - "chrono-tz 
0.9.0", - "csv", - "greptime-proto", - "itertools 0.12.1", - "lazy_static", - "log", - "regex", - "serde_json", - "urlencoding", - "yaml-rust", -] - [[package]] name = "pipeline" version = "0.8.1" dependencies = [ "api", - "arc-swap", "arrow", "async-trait", "catalog", + "chrono", + "chrono-tz 0.9.0", "common-catalog", "common-error", "common-function", @@ -7273,11 +7256,9 @@ dependencies = [ "common-recordbatch", "common-runtime", "common-telemetry", - "common-test-util", "common-time", - "console", - "criterion", "crossbeam-utils", + "csv", "datafusion 38.0.0", "datafusion-common 38.0.0", "datafusion-expr 38.0.0", @@ -7285,23 +7266,26 @@ dependencies = [ "datafusion-physical-expr 38.0.0", "datatypes", "futures", + "greptime-proto", + "itertools 0.10.5", "lazy_static", - "log-store", "once_cell", "operator", "paste", - "pipeline 0.1.0", "prometheus", "query", "rayon", + "regex", "ron", "serde", + "serde_json", "session", "snafu 0.8.3", "sql", "table", "tokio", - "tokio-test", + "urlencoding", + "yaml-rust", ] [[package]] @@ -9703,7 +9687,7 @@ dependencies = [ "permutation", "pgwire", "pin-project", - "pipeline 0.8.1", + "pipeline", "postgres-types", "pprof", "prometheus", diff --git a/src/auth/src/permission.rs b/src/auth/src/permission.rs index 272d59192fc5..57afda471c8b 100644 --- a/src/auth/src/permission.rs +++ b/src/auth/src/permission.rs @@ -30,7 +30,7 @@ pub enum PermissionReq<'a> { PromStoreWrite, PromStoreRead, Otlp, - LogWrite + LogWrite, } #[derive(Debug)] diff --git a/src/frontend/src/lib.rs b/src/frontend/src/lib.rs index 90a01c19ddfb..83fcbb3d09d2 100644 --- a/src/frontend/src/lib.rs +++ b/src/frontend/src/lib.rs @@ -19,7 +19,7 @@ pub mod frontend; pub mod heartbeat; pub mod instance; pub(crate) mod metrics; +mod pipeline; mod script; pub mod server; pub mod service_config; -mod pipeline; diff --git a/src/pipeline/Cargo.toml b/src/pipeline/Cargo.toml index 538428711a2c..f44c9825e63f 100644 --- a/src/pipeline/Cargo.toml +++ b/src/pipeline/Cargo.toml @@ -11,10 +11,11 @@ workspace = true [dependencies] api.workspace = true -arc-swap = "1.0" arrow.workspace = true async-trait.workspace = true catalog.workspace = true +chrono.workspace = true +chrono-tz = "0.9.0" common-catalog.workspace = true common-error.workspace = true common-function.workspace = true @@ -25,35 +26,36 @@ common-recordbatch.workspace = true common-runtime.workspace = true common-telemetry.workspace = true common-time.workspace = true -console = "0.15" crossbeam-utils.workspace = true -datafusion = { workspace = true } -datafusion-common = { workspace = true } -datafusion-expr = { workspace = true } -datafusion-functions = { workspace = true } -datafusion-physical-expr = { workspace = true } +csv = "1.3.0" +datafusion.workspace = true +datafusion-common.workspace = true +datafusion-expr.workspace = true +datafusion-functions.workspace = true +datafusion-physical-expr.workspace = true datatypes.workspace = true futures.workspace = true +greptime-proto.workspace = true +itertools.workspace = true lazy_static.workspace = true once_cell.workspace = true operator.workspace = true -paste = { workspace = true } -pipeline = { git = "ssh://git@github.com/GreptimeTeam/pipeline.git", rev = "3eb890c551b8d7f60c4491fcfec18966e2b210a4" } +paste.workspace = true prometheus.workspace = true query.workspace = true +regex.workspace = true +serde_json.workspace = true session.workspace = true snafu.workspace = true sql.workspace = true table.workspace = true tokio.workspace = true +urlencoding = "2.1" +yaml-rust = "0.4" 
[dev-dependencies] catalog = { workspace = true, features = ["testing"] } -common-test-util.workspace = true -criterion = { version = "0.4", features = ["html_reports", "async_tokio"] } -log-store.workspace = true rayon = "1.0" ron = "0.7" serde = { version = "1.0", features = ["derive"] } session = { workspace = true, features = ["testing"] } -tokio-test = "0.4" diff --git a/src/pipeline/src/etl/field.rs b/src/pipeline/src/etl/field.rs new file mode 100644 index 000000000000..9d76b540953f --- /dev/null +++ b/src/pipeline/src/etl/field.rs @@ -0,0 +1,195 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use itertools::Itertools; + +#[derive(Debug, Default, Clone)] +pub struct Fields(Vec); + +impl Fields { + pub(crate) fn new(fields: Vec) -> Result { + let ff = Fields(fields); + ff.check() + } + + pub(crate) fn one(field: Field) -> Self { + Fields(vec![field]) + } + + pub(crate) fn get_target_fields(&self) -> Vec<&str> { + self.0.iter().map(|f| f.get_target_field()).collect() + } + + fn check(self) -> Result { + if self.0.is_empty() { + return Err("fields must not be empty".to_string()); + } + + let mut set = std::collections::HashSet::new(); + for f in self.0.iter() { + if set.contains(&f.field) { + return Err(format!( + "field name must be unique, but got duplicated: {}", + f.field + )); + } + set.insert(&f.field); + } + + Ok(self) + } +} + +impl std::fmt::Display for Fields { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = self.0.iter().map(|f| f.to_string()).join(";"); + write!(f, "{s}") + } +} + +impl std::ops::Deref for Fields { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[derive(Debug, Default, Clone)] +pub struct Field { + pub field: String, + + // rename + pub target_field: Option, + + // 1-to-many mapping + // processors: + // - csv + pub target_fields: Option>, +} + +impl Field { + pub(crate) fn new(field: impl Into) -> Self { + Field { + field: field.into(), + target_field: None, + target_fields: None, + } + } + + // column_name in transform + pub(crate) fn get_target_field(&self) -> &str { + self.target_field.as_deref().unwrap_or(&self.field) + } + + pub(crate) fn get_field(&self) -> &str { + &self.field + } +} + +impl std::str::FromStr for Field { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut parts = s.split(','); + let field = parts.next().ok_or("field is missing")?.trim().to_string(); + + if field.is_empty() { + return Err("field is empty".to_string()); + } + + let target_field = match parts.next() { + Some(s) if !s.trim().is_empty() => Some(s.trim().to_string()), + _ => None, + }; + + let fields: Vec<_> = parts + .filter(|s| !s.trim().is_empty()) + .map(|s| s.trim().to_string()) + .collect(); + let target_fields = if fields.is_empty() { + None + } else { + Some(fields) + }; + + Ok(Field { + field, + target_field, + target_fields, + }) + } +} + +impl std::fmt::Display for Field { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 
+ match (&self.target_field, &self.target_fields) { + (Some(target_field), None) => write!(f, "{}, {target_field}", self.field), + (None, Some(target_fields)) => { + write!(f, "{}, {}", self.field, target_fields.iter().join(",")) + } + _ => write!(f, "{}", self.field), + } + } +} + +#[cfg(test)] +mod tests { + use crate::etl::field::Field; + + #[test] + fn test_parse_field() { + let field: Result = " ".parse(); + assert!(field.is_err()); + + let field: Result = ",".parse(); + assert!(field.is_err()); + + let field: Result = ",field".parse(); + assert!(field.is_err()); + + let cases = [ + // ("field", "field", None, None), + ( + "field, target_field", + "field", + Some("target_field".into()), + None, + ), + ( + "field, target_field1, target_field2, target_field3", + "field", + Some("target_field1".into()), + Some(vec!["target_field2".into(), "target_field3".into()]), + ), + ( + "field,, target_field1, target_field2, target_field3", + "field", + None, + Some(vec![ + "target_field1".into(), + "target_field2".into(), + "target_field3".into(), + ]), + ), + ]; + + for (s, field, target_field, target_fields) in cases.into_iter() { + let f: Field = s.parse().unwrap(); + assert_eq!(f.get_field(), field, "{s}"); + assert_eq!(f.target_field, target_field, "{s}"); + assert_eq!(f.target_fields, target_fields, "{s}"); + } + } +} diff --git a/src/pipeline/src/etl/mod.rs b/src/pipeline/src/etl/mod.rs new file mode 100644 index 000000000000..74c6cd96c547 --- /dev/null +++ b/src/pipeline/src/etl/mod.rs @@ -0,0 +1,195 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#![allow(dead_code)] + +pub mod field; +pub mod processor; +pub mod transform; +pub mod value; + +use itertools::Itertools; +use transform::{Transformer, Transforms}; +use yaml_rust::YamlLoader; + +const DESCRIPTION: &str = "description"; +const PROCESSORS: &str = "processors"; +const TRANSFORM: &str = "transform"; + +pub enum Content { + Json(String), + Yaml(String), +} + +pub fn parse(input: &Content) -> Result, String> +where + T: Transformer, +{ + match input { + Content::Yaml(str) => { + let docs = YamlLoader::load_from_str(str).map_err(|e| e.to_string())?; + + let doc = &docs[0]; + + let description = doc[DESCRIPTION].as_str().map(|s| s.to_string()); + + let processors = if let Some(v) = doc[PROCESSORS].as_vec() { + v.try_into()? + } else { + processor::Processors::default() + }; + + let transforms = if let Some(v) = doc[TRANSFORM].as_vec() { + v.try_into()? 
+ } else { + Transforms::default() + }; + + Ok(Pipeline { + description, + processors, + transformer: T::new(transforms)?, + }) + } + Content::Json(_) => unimplemented!(), + } +} + +#[derive(Debug, Clone)] +pub struct Pipeline +where + T: Transformer, +{ + description: Option, + processors: processor::Processors, + transformer: T, + // pub on_failure: processor::Processors, +} + +impl std::fmt::Display for Pipeline +where + T: Transformer, +{ + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + if let Some(description) = &self.description { + writeln!(f, "description: {description}")?; + } + + let processors = self.processors.iter().map(|p| p.kind()).join(","); + writeln!(f, "processors: {processors}")?; + + writeln!(f, "transformer: {}", self.transformer) + } +} + +impl Pipeline +where + T: Transformer, +{ + pub fn exec(&self, val: value::Value) -> Result { + let mut val = val; + for processor in self.processors.iter() { + val = processor.exec(val)?; + } + + self.transformer.transform(val) + } +} + +#[cfg(test)] +mod tests { + + use greptime_proto::v1::{self, ColumnDataType, SemanticType}; + + use crate::etl::transform::GreptimeTransformer; + use crate::etl::{parse, Content, Pipeline}; + + #[test] + fn test_csv_pipeline() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let pipeline_yaml = r#" +--- +description: Pipeline for Apache Tomcat + +processors: + - csv: + field: my_field, field1, field2 + +transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 +"#; + + let pipeline: Pipeline = + parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); + let output = pipeline.exec(input_value.try_into().unwrap()); + assert!(output.is_ok()); + } + + #[test] + fn test_date_pipeline() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar", + "test_time": "2014-5-17T04:34:56+00:00" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let pipeline_yaml = r#" +--- +description: Pipeline for Apache Tomcat + +processors: + - date: + field: test_time + +transform: + - field: test_time + type: time + index: timestamp +"#; + + let pipeline: Pipeline = + parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); + let output = pipeline.exec(input_value.try_into().unwrap()).unwrap(); + let schemas = output.schema; + + assert_eq!(schemas.len(), 1); + let schema = schemas[0].clone(); + assert_eq!("test_time", schema.column_name); + assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); + assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + + let row = output.rows[0].clone(); + assert_eq!(1, row.values.len()); + let value_data = row.values[0].clone().value_data; + assert_eq!( + Some(v1::value::ValueData::TimestampNanosecondValue( + 1400301296000000000 + )), + value_data + ); + } +} diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs new file mode 100644 index 000000000000..7001ddb5b493 --- /dev/null +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -0,0 +1,361 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use urlencoding::decode; + +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, +}; +use crate::etl::value::{Map, Value}; + +pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; + +const CMCD_KEY_BR: &str = "br"; // Encoded bitrate, Integer kbps +const CMCD_KEY_BL: &str = "bl"; // Buffer length, Integer milliseconds +const CMCD_KEY_BS: &str = "bs"; // Buffer starvation, Boolean +const CMCD_KEY_CID: &str = "cid"; // Content ID, String +const CMCD_KEY_D: &str = "d"; // Object duration, Integer milliseconds +const CMCD_KEY_DL: &str = "dl"; // Deadline, Integer milliseconds +const CMCD_KEY_MTP: &str = "mtp"; // Measured throughput, Integer kbps +const CMCD_KEY_NOR: &str = "nor"; // Next object request, String +const CMCD_KEY_NRR: &str = "nrr"; // Next request range, String, "-" +const CMCD_KEY_OT: &str = "ot"; // Object type, Token - one of [m,a,v,av,i,c,tt,k,o] +const CMCD_KEY_PR: &str = "pr"; // Playback rate, Decimal +const CMCD_KEY_RTP: &str = "rtp"; // Requested maximum throughput, Integer kbps +const CMCD_KEY_SF: &str = "sf"; // Stall frequency, Token - one of [d,h,s,o] +const CMCD_KEY_SID: &str = "sid"; // Session ID, String +const CMCD_KEY_ST: &str = "st"; // Stream type, Token - one of [v,l] +const CMCD_KEY_SU: &str = "su"; // Startup, Boolean +const CMCD_KEY_TB: &str = "tb"; // Top bitrate, Integer kbps +const CMCD_KEY_V: &str = "v"; // Version + +/// Common Media Client Data Specification: +/// https://cdn.cta.tech/cta/media/media/resources/standards/pdfs/cta-5004-final.pdf +/// +/// +/// The data payload for Header and Query Argument transmission consists of a series of +/// key/value pairs constructed according to the following rules: +/// 1. All information in the payload MUST be represented as = pairs. +/// 2. The key and value MUST be separated by an equals sign Unicode 0x3D. If the +/// value type is BOOLEAN and the value is TRUE, then the equals sign and the value +/// MUST be omitted. +/// 3. Successive key/value pairs MUST be delimited by a comma Unicode 0x2C. +/// 4. The key names described in this specification are reserved. Custom key names +/// may be used, but they MUST carry a hyphenated prefix to ensure that there will +/// not be a namespace collision with future revisions to this specification. Clients +/// SHOULD use a reverse-DNS syntax when defining their own prefix. +/// 5. If headers are used for data transmission, then custom keys SHOULD be +/// allocated to one of the four defined header names based upon their expected +/// level of variability: +/// a. CMCD-Request: keys whose values vary with each request. +/// b. CMCD-Object: keys whose values vary with the object being requested. +/// c. CMCD-Status: keys whose values do not vary with every request or object. +/// d. CMCD-Session: keys whose values are expected to be invariant over the life of the session. +/// 6. All key names are case-sensitive. +/// 7. Any value of type String MUST be enclosed by opening and closing double +/// quotes Unicode 0x22. 
Double quotes and backslashes MUST be escaped using a +/// backslash "\" Unicode 0x5C character. Any value of type Token does not require +/// quoting. +/// 8. All keys are OPTIONAL. +/// 9. Key-value pairs SHOULD be sequenced in alphabetical order of the key name in +/// order to reduce the fingerprinting surface exposed by the player. +/// 10. If the data payload is transmitted as a query argument, then the entire payload +/// string MUST be URLEncoded per [5]. Data payloads transmitted via headers +/// MUST NOT be URLEncoded. +/// 11. The data payload syntax is intended to be compliant with Structured Field Values for HTTP [6]. +/// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data. +#[derive(Debug, Default)] +pub struct CMCDProcessor { + fields: Fields, + + ignore_missing: bool, +} + +impl CMCDProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn parse(prefix: &str, s: &str) -> Result { + let mut map = Map::default(); + let parts = s.split(','); + for part in parts { + let mut kv = part.split('='); + let k = kv.next().ok_or(format!("{part} missing key in {s}"))?; + let v = kv.next(); + + let key = format!("{prefix}_{k}"); + match k { + CMCD_KEY_BS | CMCD_KEY_SU => { + map.insert(key, Value::Boolean(true)); + } + CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP + | CMCD_KEY_RTP | CMCD_KEY_TB => { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + let val: i64 = v + .parse() + .map_err(|_| format!("failed to parse {v} as i64"))?; + map.insert(key, Value::Int64(val)); + } + CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID + | CMCD_KEY_ST | CMCD_KEY_V => { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + map.insert(key, Value::String(v.to_string())); + } + CMCD_KEY_NOR => { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + let val = match decode(v) { + Ok(val) => val.to_string(), + Err(_) => v.to_string(), + }; + map.insert(key, Value::String(val)); + } + CMCD_KEY_PR => { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + let val: f64 = v + .parse() + .map_err(|_| format!("failed to parse {v} as f64"))?; + map.insert(key, Value::Float64(val)); + } + _ => match v { + Some(v) => map.insert(key, Value::String(v.to_string())), + None => map.insert(k, Value::Boolean(true)), + }, + } + } + + Ok(map) + } + + fn process_field(&self, val: &str, field: &Field) -> Result { + let prefix = match field.target_field { + Some(ref target_field) => target_field, + None => field.get_field(), + }; + + Self::parse(prefix, val) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for CMCDProcessor { + type Error = String; + + fn try_from(value: &yaml_rust::yaml::Hash) -> Result { + let mut processor = CMCDProcessor::default(); + + for (k, v) in value.iter() { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + match key { + FIELD_NAME => { + processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + + _ => {} + } + } + + Ok(processor) + } +} + +impl crate::etl::processor::Processor for CMCDProcessor { + fn kind(&self) -> &str { + PROCESSOR_CMCD + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields 
{ + &self.fields + } + + fn exec_field(&self, val: &Value, field: &Field) -> Result { + match val { + Value::String(val) => self.process_field(val, field), + _ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use urlencoding::decode; + + use super::CMCDProcessor; + use crate::etl::value::{Map, Value}; + + #[test] + fn test_cmcd() { + let ss = [ + ( + "sid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", + vec![( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + )], + ), + ( + "br%3D3200%2Cbs%2Cd%3D4004%2Cmtp%3D25400%2Cot%3Dv%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22%2Ctb%3D6000", + vec![ + ("prefix_bs", Value::Boolean(true)), + ("prefix_ot", Value::String("v".into())), + ("prefix_rtp", Value::Int64(15000)), + ("prefix_br", Value::Int64(3200)), + ("prefix_tb", Value::Int64(6000)), + ("prefix_d", Value::Int64(4004)), + ( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + ), + ("prefix_mtp", Value::Int64(25400)), + ], + ), + ( + "b%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", + vec![ + ( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + ), + ("prefix_rtp", Value::Int64(15000)), + ("b", Value::Boolean(true)), + ], + ), + ( + "bs%2Csu", + vec![ + ("prefix_su", Value::Boolean(true)), + ("prefix_bs", Value::Boolean(true)), + ], + ), + ( + "d%3D4004%2Ccom.example-myNumericKey%3D500%2Ccom.examplemyStringKey%3D%22myStringValue%22", + vec![ + ( + "prefix_com.example-myNumericKey", + Value::String("500".into()), + ), + ( + "prefix_com.examplemyStringKey", + Value::String("\"myStringValue\"".into()), + ), + ("prefix_d", Value::Int64(4004)), + ], + ), + ( + "nor%3D%22..%252F300kbps%252Fsegment35.m4v%22%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", + vec![ + ( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + ), + ( + "prefix_nor", + Value::String("\"../300kbps/segment35.m4v\"".into()), + + ), + ], + ), + ( + "nrr%3D%2212323-48763%22%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", + vec![ + ("prefix_nrr", Value::String("\"12323-48763\"".into())), + ( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + ), + ], + ), + ( + "nor%3D%22..%252F300kbps%252Ftrack.m4v%22%2Cnrr%3D%2212323-48763%22%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", + vec![ + ("prefix_nrr", Value::String("\"12323-48763\"".into())), + ( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + ), + ( + "prefix_nor", + Value::String("\"../300kbps/track.m4v\"".into()), + ), + ], + ), + ( + "bl%3D21300%2Cbr%3D3200%2Cbs%2Ccid%3D%22faec5fc2-ac30-11eabb37-0242ac130002%22%2Cd%3D4004%2Cdl%3D18500%2Cmtp%3D48100%2Cnor%3D%22..%252F300kbps%252Ftrack.m4v%22%2Cnrr%3D%2212323-48763%22%2Cot%3Dv%2Cpr%3D1.08%2Crtp%3D12000%2Csf%3Dd%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22%2Cst%3Dv%2Csu%2Ctb%3D6000", + vec![ + ("prefix_bl", Value::Int64(21300)), + ("prefix_bs", Value::Boolean(true)), + ("prefix_st", Value::String("v".into())), + ("prefix_ot", Value::String("v".into())), + ( + "prefix_sid", + Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + ), + ("prefix_tb", Value::Int64(6000)), + ("prefix_d", Value::Int64(4004)), + ( + "prefix_cid", + Value::String("\"faec5fc2-ac30-11eabb37-0242ac130002\"".into()), + ), + ("prefix_mtp", Value::Int64(48100)), + 
("prefix_rtp", Value::Int64(12000)), + ( + "prefix_nor", + Value::String("\"../300kbps/track.m4v\"".into()), + ), + ("prefix_sf", Value::String("d".into())), + ("prefix_br", Value::Int64(3200)), + ("prefix_nrr", Value::String("\"12323-48763\"".into())), + ("prefix_pr", Value::Float64(1.08)), + ("prefix_su", Value::Boolean(true)), + ("prefix_dl", Value::Int64(18500)), + ], + ), + ]; + + for (s, vec) in ss.into_iter() { + let decoded = decode(s).unwrap().to_string(); + + let values = vec + .into_iter() + .map(|(k, v)| (k.to_string(), v)) + .collect::>(); + let expected = Map { values }; + + let actual = CMCDProcessor::parse("prefix", &decoded).unwrap(); + assert_eq!(actual, expected); + } + } +} diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs new file mode 100644 index 000000000000..1cd110922892 --- /dev/null +++ b/src/pipeline/src/etl/processor/csv.rs @@ -0,0 +1,327 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html + +use std::collections::HashMap; + +use csv::{ReaderBuilder, Trim}; +use itertools::EitherOrBoth::{Both, Left, Right}; +use itertools::Itertools; + +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, +}; +use crate::etl::value::{Map, Value}; + +pub(crate) const PROCESSOR_CSV: &str = "csv"; + +const SEPARATOR_NAME: &str = "separator"; +const QUOTE_NAME: &str = "quote"; +const TRIM_NAME: &str = "trim"; +const EMPTY_VALUE_NAME: &str = "empty_value"; + +/// only support string value +#[derive(Debug)] +pub struct CsvProcessor { + reader: ReaderBuilder, + + fields: Fields, + + ignore_missing: bool, + + // Value used to fill empty fields, empty fields will be skipped if this is not provided. 
+ empty_value: Option, + // description + // if + // ignore_failure + // on_failure + // tag +} + +impl CsvProcessor { + fn new() -> Self { + let mut reader = ReaderBuilder::new(); + reader.has_headers(false); + + Self { + reader, + fields: Fields::default(), + ignore_missing: false, + empty_value: None, + } + } + + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn try_separator(&mut self, separator: String) -> Result<(), String> { + if separator.len() != 1 { + Err(format!( + "'{}' must be a single character, but got '{}'", + SEPARATOR_NAME, separator + )) + } else { + self.reader.delimiter(separator.as_bytes()[0]); + Ok(()) + } + } + + fn try_quote(&mut self, quote: String) -> Result<(), String> { + if quote.len() != 1 { + Err(format!( + "'{}' must be a single character, but got '{}'", + QUOTE_NAME, quote + )) + } else { + self.reader.quote(quote.as_bytes()[0]); + Ok(()) + } + } + + fn with_trim(&mut self, trim: bool) { + if trim { + self.reader.trim(Trim::All); + } else { + self.reader.trim(Trim::None); + } + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn with_empty_value(&mut self, empty_value: String) { + self.empty_value = Some(empty_value); + } + + // process the csv format string to a map with target_fields as keys + fn process_field(&self, val: &str, field: &Field) -> Result { + let mut reader = self.reader.from_reader(val.as_bytes()); + + if let Some(result) = reader.records().next() { + let record: csv::StringRecord = result.map_err(|e| e.to_string())?; + + let values: HashMap = field + .target_fields + .as_ref() + .ok_or(format!( + "target fields must be set after '{}'", + field.get_field() + ))? + .iter() + .map(|f| f.to_string()) + .zip_longest(record.iter()) + .filter_map(|zipped| match zipped { + Both(target_field, val) => Some((target_field, Value::String(val.into()))), + // if target fields are more than extracted fields, fill the rest with empty value + Left(target_field) => { + let value = self + .empty_value + .as_ref() + .map(|s| Value::String(s.clone())) + .unwrap_or(Value::Null); + Some((target_field, value)) + } + // if extracted fields are more than target fields, ignore the rest + Right(_) => None, + }) + .collect(); + + Ok(Map { values }) + } else { + Err("expected at least one record from csv format, but got none".into()) + } + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor { + type Error = String; + + fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { + let mut processor = CsvProcessor::new(); + for (k, v) in hash { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + match key { + FIELD_NAME => { + processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + SEPARATOR_NAME => { + processor.try_separator(yaml_string(v, SEPARATOR_NAME)?)?; + } + QUOTE_NAME => { + processor.try_quote(yaml_string(v, QUOTE_NAME)?)?; + } + TRIM_NAME => { + processor.with_trim(yaml_bool(v, TRIM_NAME)?); + } + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + EMPTY_VALUE_NAME => { + processor.with_empty_value(yaml_string(v, EMPTY_VALUE_NAME)?); + } + + _ => {} + } + } + + Ok(processor) + } +} + +impl Processor for CsvProcessor { + fn kind(&self) -> &str { + PROCESSOR_CSV + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn 
exec_field(&self, val: &Value, field: &Field) -> Result { + match val { + Value::String(val) => self.process_field(val, field), + _ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} + +// TODO(yuanbohan): more test cases +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::{CsvProcessor, Value}; + use crate::etl::field::Fields; + use crate::etl::processor::Processor; + use crate::etl::value::Map; + + #[test] + fn test_equal_length() { + let mut processor = CsvProcessor::new(); + let field = "data,, a, b".parse().unwrap(); + processor.with_fields(Fields::one(field)); + + let values: HashMap = [("data".into(), Value::String("1,2".into()))] + .into_iter() + .collect(); + + let result = processor.exec(Value::Map(Map { values })).unwrap(); + + let values = [ + ("data".into(), Value::String("1,2".into())), + ("a".into(), Value::String("1".into())), + ("b".into(), Value::String("2".into())), + ] + .into_iter() + .collect(); + let expected = Value::Map(Map { values }); + + assert_eq!(expected, result); + } + + // test target_fields length larger than the record length + #[test] + fn test_target_fields_has_more_length() { + let values = [("data".into(), Value::String("1,2".into()))] + .into_iter() + .collect(); + let input = Value::Map(Map { values }); + + // with no empty value + { + let mut processor = CsvProcessor::new(); + let field = "data,, a,b,c".parse().unwrap(); + processor.with_fields(Fields::one(field)); + + let result = processor.exec(input.clone()).unwrap(); + + let values = [ + ("data".into(), Value::String("1,2".into())), + ("a".into(), Value::String("1".into())), + ("b".into(), Value::String("2".into())), + ("c".into(), Value::Null), + ] + .into_iter() + .collect(); + let expected = Value::Map(Map { values }); + + assert_eq!(expected, result); + } + + // with empty value + { + let mut processor = CsvProcessor::new(); + let field = "data,, a,b,c".parse().unwrap(); + processor.with_fields(Fields::one(field)); + processor.with_empty_value("default".into()); + + let result = processor.exec(input).unwrap(); + + let values = [ + ("data".into(), Value::String("1,2".into())), + ("a".into(), Value::String("1".into())), + ("b".into(), Value::String("2".into())), + ("c".into(), Value::String("default".into())), + ] + .into_iter() + .collect(); + let expected = Value::Map(Map { values }); + + assert_eq!(expected, result); + } + } + + // test record has larger length + #[test] + fn test_target_fields_has_less_length() { + let values = [("data".into(), Value::String("1,2,3".into()))] + .into_iter() + .collect(); + let input = Value::Map(Map { values }); + + let mut processor = CsvProcessor::new(); + let field = "data,,a,b".parse().unwrap(); + processor.with_fields(Fields::one(field)); + + let result = processor.exec(input).unwrap(); + + let values = [ + ("data".into(), Value::String("1,2,3".into())), + ("a".into(), Value::String("1".into())), + ("b".into(), Value::String("2".into())), + ] + .into_iter() + .collect(); + let expected = Value::Map(Map { values }); + + assert_eq!(expected, result); + } +} diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs new file mode 100644 index 000000000000..6715522793a7 --- /dev/null +++ b/src/pipeline/src/etl/processor/date.rs @@ -0,0 +1,345 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use chrono::{DateTime, NaiveDateTime}; +use chrono_tz::Tz; +use lazy_static::lazy_static; + +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, +}; +use crate::etl::value::{Map, Time, Value}; + +pub(crate) const PROCESSOR_DATE: &str = "date"; + +const FORMATS_NAME: &str = "formats"; // default RFC3339 +const TIMEZONE_NAME: &str = "timezone"; // default UTC +const LOCALE_NAME: &str = "locale"; +const OUTPUT_FORMAT_NAME: &str = "output_format"; // default with input format + +lazy_static! { + static ref DEFAULT_FORMATS: Vec = vec![ + // timezone with colon + "%Y-%m-%dT%H:%M:%S%:z", + "%Y-%m-%dT%H:%M:%S%.3f%:z", + "%Y-%m-%dT%H:%M:%S%.6f%:z", + "%Y-%m-%dT%H:%M:%S%.9f%:z", + // timezone without colon + "%Y-%m-%dT%H:%M:%S%z", + "%Y-%m-%dT%H:%M:%S%.3f%z", + "%Y-%m-%dT%H:%M:%S%.6f%z", + "%Y-%m-%dT%H:%M:%S%.9f%z", + // without timezone + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%S%.3f", + "%Y-%m-%dT%H:%M:%S%.6f", + "%Y-%m-%dT%H:%M:%S%.9f", + ] + .iter() + .map(|s| s.to_string()) + .collect(); +} + +#[derive(Debug, Default)] +struct Formats(Vec); + +impl Formats { + fn new(mut formats: Vec) -> Self { + formats.sort(); + formats.dedup(); + Formats(formats) + } +} + +impl std::ops::Deref for Formats { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[derive(Debug, Default)] +pub struct DateProcessor { + fields: Fields, + + formats: Formats, + timezone: Option, + locale: Option, // to support locale + output_format: Option, + + ignore_missing: bool, + // description + // if + // ignore_failure + // on_failure + // tag +} + +impl DateProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields + } + + fn with_formats(&mut self, v: Option>) { + let v = match v { + Some(v) if !v.is_empty() => v, + _ => DEFAULT_FORMATS.clone(), + }; + + let formats = Formats::new(v); + self.formats = formats; + } + + fn with_timezone(&mut self, timezone: String) { + if !timezone.is_empty() { + self.timezone = Some(timezone); + } + } + + fn with_locale(&mut self, locale: String) { + if !locale.is_empty() { + self.locale = Some(locale); + } + } + + fn with_output_format(&mut self, output_format: String) { + if !output_format.is_empty() { + self.output_format = Some(output_format); + } + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn parse(&self, val: &str) -> Result { + let mut tz = Tz::UTC; + if let Some(timezone) = &self.timezone { + tz = timezone.parse::().map_err(|e| e.to_string())?; + } + + for fmt in self.formats.iter() { + if let Ok(ns) = try_parse(val, fmt, tz) { + let mut t = Time::new(val, ns); + t.with_format(fmt); + t.with_timezone(self.timezone.clone()); + return Ok(t); + } + } + + Err(format!("{} processor: failed to parse {val}", self.kind(),)) + } + + fn process_field(&self, val: &str, field: &Field) -> Result { + let key = match field.target_field { + Some(ref target_field) => target_field, + None => 
field.get_field(), + }; + + Ok(Map::one(key, Value::Time(self.parse(val)?))) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor { + type Error = String; + + fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { + let mut processor = DateProcessor::default(); + + let mut formats_opt = None; + + for (k, v) in hash { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + + match key { + FIELD_NAME => { + processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + + FORMATS_NAME => { + let formats = yaml_strings(v, FORMATS_NAME)?; + formats_opt = Some(formats); + } + TIMEZONE_NAME => { + processor.with_timezone(yaml_string(v, TIMEZONE_NAME)?); + } + LOCALE_NAME => { + processor.with_locale(yaml_string(v, LOCALE_NAME)?); + } + OUTPUT_FORMAT_NAME => { + processor.with_output_format(yaml_string(v, OUTPUT_FORMAT_NAME)?); + } + + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + + _ => {} + } + } + + processor.with_formats(formats_opt); + + Ok(processor) + } +} + +impl Processor for DateProcessor { + fn kind(&self) -> &str { + PROCESSOR_DATE + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn exec_field(&self, val: &Value, field: &Field) -> Result { + match val { + Value::String(s) => self.process_field(s, field), + _ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} + +/// try to parse val with timezone first, if failed, parse without timezone +fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result { + if let Ok(dt) = DateTime::parse_from_str(val, fmt) { + Ok(dt.timestamp_nanos_opt().ok_or("failed to get timestamp")?) + } else { + let dt = NaiveDateTime::parse_from_str(val, fmt) + .map_err(|e| e.to_string())? + .and_local_timezone(tz) + .single() + .ok_or("failed to get local timezone")?; + Ok(dt.timestamp_nanos_opt().ok_or("failed to get timestamp")?) 
+ } +} + +#[cfg(test)] +mod tests { + use chrono_tz::Asia::Tokyo; + + use crate::etl::processor::date::{try_parse, DateProcessor}; + + #[test] + fn test_try_parse() { + let time_with_tz = "2014-5-17T04:34:56+00:00"; + let fmt_with_tz = "%Y-%m-%dT%H:%M:%S%:z"; + + let time_without_tz = "2014-5-17T13:34:56"; + let fmt_without_tz = "%Y-%m-%dT%H:%M:%S"; + + let tz = Tokyo; + + let parsed_with_tz = try_parse(time_with_tz, fmt_with_tz, tz); + assert!(parsed_with_tz.is_ok()); + + let parsed_without_tz = try_parse(time_without_tz, fmt_without_tz, tz); + assert!(parsed_without_tz.is_ok()); + + assert_eq!(parsed_with_tz.unwrap(), parsed_without_tz.unwrap()); + } + + #[test] + fn test_parse() { + let mut processor = DateProcessor::default(); + processor.with_formats(None); + + let values: Vec<&str> = vec![ + "2014-5-17T12:34:56", + "2014-5-17T12:34:56Z", + "2014-5-17T12:34:56+09:30", + "2014-5-17T12:34:56.000+09:30", + "2014-5-17T12:34:56-0930", + "2014-5-17T12:34:56.000-0930", + ] + .into_iter() + .collect(); + + for value in values { + let parsed = processor.parse(value); + assert!(parsed.is_ok()); + } + } + + #[test] + fn test_parse_with_formats() { + let mut processor = DateProcessor::default(); + let formats = vec![ + "%Y-%m-%dT%H:%M:%S%:z", + "%Y-%m-%dT%H:%M:%S%.3f%:z", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%SZ", + ] + .into_iter() + .map(|s| s.to_string()) + .collect(); + processor.with_formats(Some(formats)); + + let values: Vec<&str> = vec![ + "2014-5-17T12:34:56", + "2014-5-17T12:34:56Z", + "2014-5-17T12:34:56+09:30", + "2014-5-17T12:34:56.000+09:30", + "2014-5-17T12:34:56-0930", + "2014-5-17T12:34:56.000-0930", + ] + .into_iter() + .collect(); + + for value in values { + let parsed = processor.parse(value); + assert!(parsed.is_ok()); + } + } + + #[test] + fn test_parse_with_timezone() { + let mut processor = DateProcessor::default(); + processor.with_formats(None); + processor.with_timezone("Asia/Tokyo".to_string()); + + let values: Vec<&str> = vec![ + "2014-5-17T12:34:56", + "2014-5-17T12:34:56Z", + "2014-5-17T12:34:56+09:30", + "2014-5-17T12:34:56.000+09:30", + "2014-5-17T12:34:56-0930", + "2014-5-17T12:34:56.000-0930", + ] + .into_iter() + .collect(); + + for value in values { + let parsed = processor.parse(value); + assert!(parsed.is_ok()); + } + } +} diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs new file mode 100644 index 000000000000..feee2fa8d717 --- /dev/null +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -0,0 +1,205 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
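+
+//! The `epoch` processor parses a string, integer, float, time or epoch value
+//! into an `Epoch` of the configured resolution (second, milli, micro or nano).
+//! An illustrative pipeline entry (YAML layout and field name assumed; the exact
+//! resolution aliases are the ones defined in `value::time`):
+//!
+//!   epoch:
+//!     field: ts
+//!     resolution: millisecond
+//!     ignore_missing: true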
+ +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, +}; +use crate::etl::value::time::{ + MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, + MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION, + SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION, +}; +use crate::etl::value::{Epoch, Map, Value}; + +pub(crate) const PROCESSOR_EPOCH: &str = "epoch"; +const RESOLUTION_NAME: &str = "resolution"; + +#[derive(Debug, Default)] +enum Resolution { + Second, + #[default] + Milli, + Micro, + Nano, +} + +impl TryFrom<&str> for Resolution { + type Error = String; + + fn try_from(s: &str) -> Result { + match s { + SECOND_RESOLUTION | SEC_RESOLUTION | S_RESOLUTION => Ok(Resolution::Second), + MILLISECOND_RESOLUTION | MILLI_RESOLUTION | MS_RESOLUTION => Ok(Resolution::Milli), + MICROSECOND_RESOLUTION | MICRO_RESOLUTION | US_RESOLUTION => Ok(Resolution::Micro), + NANOSECOND_RESOLUTION | NANO_RESOLUTION | NS_RESOLUTION => Ok(Resolution::Nano), + _ => Err(format!("invalid resolution: {s}")), + } + } +} + +/// support string, integer, float, time, epoch +#[derive(Debug, Default)] +pub struct EpochProcessor { + fields: Fields, + resolution: Resolution, + ignore_missing: bool, + // description + // if + // ignore_failure + // on_failure + // tag +} + +impl EpochProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields + } + + fn with_resolution(&mut self, resolution: Resolution) { + self.resolution = resolution; + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn parse(&self, val: &Value) -> Result { + let t: i64 = match val { + Value::String(s) => s.parse::().map_err(|e| e.to_string())?, + Value::Int16(i) => *i as i64, + Value::Int32(i) => *i as i64, + Value::Int64(i) => *i, + Value::Uint8(i) => *i as i64, + Value::Uint16(i) => *i as i64, + Value::Uint32(i) => *i as i64, + Value::Uint64(i) => *i as i64, + Value::Float32(f) => *f as i64, + Value::Float64(f) => *f as i64, + + Value::Time(t) => match self.resolution { + Resolution::Second => t.timestamp(), + Resolution::Milli => t.timestamp_millis(), + Resolution::Micro => t.timestamp_micros(), + Resolution::Nano => t.timestamp_nanos(), + }, + + Value::Epoch(e) => match self.resolution { + Resolution::Second => e.timestamp(), + Resolution::Milli => e.timestamp_millis(), + Resolution::Micro => e.timestamp_micros(), + Resolution::Nano => e.timestamp_nanos(), + }, + + _ => { + return Err(format!( + "{PROCESSOR_EPOCH} processor: unsupported value {val}" + )) + } + }; + + match self.resolution { + Resolution::Second => Ok(Epoch::Second(t)), + Resolution::Milli => Ok(Epoch::Millisecond(t)), + Resolution::Micro => Ok(Epoch::Microsecond(t)), + Resolution::Nano => Ok(Epoch::Nanosecond(t)), + } + } + + fn process_field(&self, val: &Value, field: &Field) -> Result { + let key = match field.target_field { + Some(ref target_field) => target_field, + None => field.get_field(), + }; + + Ok(Map::one(key, Value::Epoch(self.parse(val)?))) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor { + type Error = String; + + fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { + let mut processor = EpochProcessor::default(); + + for (k, v) in hash { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + + match key { + FIELD_NAME => { + 
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + RESOLUTION_NAME => { + let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?; + processor.with_resolution(s); + } + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + + _ => {} + } + } + + Ok(processor) + } +} + +impl Processor for EpochProcessor { + fn kind(&self) -> &str { + PROCESSOR_EPOCH + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn exec_field(&self, val: &Value, field: &Field) -> Result { + self.process_field(val, field) + } +} + +#[cfg(test)] +mod tests { + use super::EpochProcessor; + use crate::etl::value::Value; + + #[test] + fn test_parse_epoch() { + let mut processor = EpochProcessor::default(); + processor.with_resolution(super::Resolution::Second); + + let values = [ + Value::String("1573840000".into()), + Value::Int32(1573840000), + Value::Uint64(1573840000), + Value::Float32(1573840000.0), + ]; + + for value in values { + let parsed = processor.parse(&value).unwrap(); + assert_eq!(parsed, super::Epoch::Second(1573840000)); + } + } +} diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs new file mode 100644 index 000000000000..1c2fcf9eacfc --- /dev/null +++ b/src/pipeline/src/etl/processor/letter.rs @@ -0,0 +1,188 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
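+
+//! The `letter` processor changes the case of a string field; `method` is one of
+//! `upper`, `lower` or `capital`. An illustrative pipeline entry (YAML layout and
+//! field name assumed):
+//!
+//!   letter:
+//!     field: level
+//!     method: upper
+//!     ignore_missing: true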
+ +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, +}; +use crate::etl::value::{Map, Value}; + +pub(crate) const PROCESSOR_LETTER: &str = "letter"; + +#[derive(Debug, Default)] +enum Method { + Upper, + #[default] + Lower, + Capital, +} + +impl std::fmt::Display for Method { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Method::Upper => write!(f, "upper"), + Method::Lower => write!(f, "lower"), + Method::Capital => write!(f, "capital"), + } + } +} + +impl std::str::FromStr for Method { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "upper" => Ok(Method::Upper), + "lower" => Ok(Method::Lower), + "capital" => Ok(Method::Capital), + _ => Err(format!("invalid method: {s}")), + } + } +} + +/// only support string value +#[derive(Debug, Default)] +pub struct LetterProcessor { + fields: Fields, + method: Method, + ignore_missing: bool, +} + +impl LetterProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn with_method(&mut self, method: Method) { + self.method = method; + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn process_field(&self, val: &str, field: &Field) -> Result { + let processed = match self.method { + Method::Upper => val.to_uppercase(), + Method::Lower => val.to_lowercase(), + Method::Capital => capitalize(val), + }; + let val = Value::String(processed); + + let key = match field.target_field { + Some(ref target_field) => target_field, + None => field.get_field(), + }; + + Ok(Map::one(key, val)) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor { + type Error = String; + + fn try_from(value: &yaml_rust::yaml::Hash) -> Result { + let mut processor = LetterProcessor::default(); + + for (k, v) in value.iter() { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + match key { + FIELD_NAME => { + processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + METHOD_NAME => { + let method = yaml_string(v, METHOD_NAME)?; + processor.with_method(method.parse()?); + } + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + _ => {} + } + } + + Ok(processor) + } +} + +impl Processor for LetterProcessor { + fn kind(&self) -> &str { + PROCESSOR_LETTER + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn exec_field(&self, val: &Value, field: &Field) -> Result { + match val { + Value::String(val) => self.process_field(val, field), + _ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} + +fn capitalize(s: &str) -> String { + let mut c = s.chars(); + match c.next() { + None => String::new(), + Some(f) => f.to_uppercase().collect::() + c.as_str(), + } +} + +#[cfg(test)] +mod tests { + use crate::etl::field::Fields; + use crate::etl::processor::letter::{LetterProcessor, Method}; + use crate::etl::value::{Map, Value}; + + #[test] + fn test_process() { + let field = "letter"; + let ff: crate::etl::processor::Field = field.parse().unwrap(); + let mut processor = LetterProcessor::default(); + processor.with_fields(Fields::one(ff.clone())); + + { + 
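+            // upper: the whole value is upper-cased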
processor.with_method(Method::Upper); + let processed = processor.process_field("pipeline", &ff).unwrap(); + assert_eq!(Map::one(field, Value::String("PIPELINE".into())), processed) + } + + { + processor.with_method(Method::Lower); + let processed = processor.process_field("Pipeline", &ff).unwrap(); + assert_eq!(Map::one(field, Value::String("pipeline".into())), processed) + } + + { + processor.with_method(Method::Capital); + let processed = processor.process_field("pipeline", &ff).unwrap(); + assert_eq!(Map::one(field, Value::String("Pipeline".into())), processed) + } + } +} diff --git a/src/pipeline/src/etl/processor/mod.rs b/src/pipeline/src/etl/processor/mod.rs new file mode 100644 index 000000000000..c04414b87cbd --- /dev/null +++ b/src/pipeline/src/etl/processor/mod.rs @@ -0,0 +1,198 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod cmcd; +pub mod csv; +pub mod date; +pub mod epoch; +pub mod letter; +pub mod regex; +pub mod urlencoding; + +use std::sync::Arc; + +use cmcd::CMCDProcessor; +use common_telemetry::warn; +use csv::CsvProcessor; +use date::DateProcessor; +use epoch::EpochProcessor; +use letter::LetterProcessor; +use regex::RegexProcessor; +use urlencoding::UrlEncodingProcessor; + +use crate::etl::field::{Field, Fields}; +use crate::etl::value::{Array, Map, Value}; + +const FIELD_NAME: &str = "field"; +const FIELDS_NAME: &str = "fields"; +const IGNORE_MISSING_NAME: &str = "ignore_missing"; +const METHOD_NAME: &str = "method"; + +// const IF_NAME: &str = "if"; +// const IGNORE_FAILURE_NAME: &str = "ignore_failure"; +// const ON_FAILURE_NAME: &str = "on_failure"; +// const TAG_NAME: &str = "tag"; + +pub trait Processor: std::fmt::Debug + Send + Sync + 'static { + fn fields(&self) -> &Fields; + fn kind(&self) -> &str; + fn ignore_missing(&self) -> bool; + + fn ignore_processor_array_failure(&self) -> bool { + true + } + + fn exec_field(&self, _val: &Value, _field: &Field) -> Result { + Ok(Map::default()) + } + + fn exec_map(&self, mut map: Map) -> Result { + for ff @ Field { field, .. 
} in self.fields().iter() { + let val = map.get(field); + match val { + Some(v) => { + map.extend(self.exec_field(v, ff)?); + } + None if self.ignore_missing() => {} + None => { + return Err(format!( + "{} processor: field '{field}' is required but missing in {map}", + self.kind(), + )) + } + } + } + + Ok(Value::Map(map)) + } + + fn exec_array(&self, arr: Array) -> Result { + let mut values = vec![]; + for val in arr.into_iter() { + match val { + Value::Map(map) => { + values.push(self.exec_map(map)?); + } + _ if self.ignore_processor_array_failure() => { + warn!("expected a map, but got {val}") + } + _ => return Err(format!("expected a map, but got {}", val)), + } + } + + Ok(Value::Array(Array { values })) + } + + fn exec(&self, val: Value) -> Result { + match val { + Value::Map(map) => self.exec_map(map), + Value::Array(arr) => self.exec_array(arr), + _ => Err(format!("expected a map or array, but got {}", val)), + } + } +} + +#[derive(Debug, Default, Clone)] +pub struct Processors { + pub processors: Vec>, +} + +impl Processors { + pub fn new() -> Self { + Processors { processors: vec![] } + } +} + +impl std::ops::Deref for Processors { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.processors + } +} + +impl TryFrom<&Vec> for Processors { + type Error = String; + + fn try_from(vec: &Vec) -> Result { + let mut processors = vec![]; + + for doc in vec { + processors.push(parse_processor(doc)?); + } + + Ok(Processors { processors }) + } +} + +fn parse_processor(doc: &yaml_rust::Yaml) -> Result, String> { + let map = doc.as_hash().ok_or("processor must be a map".to_string())?; + + let key = map + .keys() + .next() + .ok_or("processor must have a string key".to_string())?; + + let value = map + .get(key) + .unwrap() + .as_hash() + .expect("processor value must be a map"); + + let str_key = key + .as_str() + .ok_or("processor key must be a string".to_string())?; + + let processor: Arc = match str_key { + cmcd::PROCESSOR_CMCD => Arc::new(CMCDProcessor::try_from(value)?), + csv::PROCESSOR_CSV => Arc::new(CsvProcessor::try_from(value)?), + date::PROCESSOR_DATE => Arc::new(DateProcessor::try_from(value)?), + epoch::PROCESSOR_EPOCH => Arc::new(EpochProcessor::try_from(value)?), + letter::PROCESSOR_LETTER => Arc::new(LetterProcessor::try_from(value)?), + regex::PROCESSOR_REGEX => Arc::new(RegexProcessor::try_from(value)?), + urlencoding::PROCESSOR_URL_ENCODING => Arc::new(UrlEncodingProcessor::try_from(value)?), + _ => return Err(format!("unsupported {} processor", str_key)), + }; + + Ok(processor) +} + +pub(crate) fn yaml_string(v: &yaml_rust::Yaml, field: &str) -> Result { + v.as_str() + .map(|s| s.trim().to_string()) + .ok_or(format!("'{field}' must be a string")) +} + +pub(crate) fn yaml_strings(v: &yaml_rust::Yaml, field: &str) -> Result, String> { + let vec = v + .as_vec() + .ok_or(format!("'{field}' must be a list of strings",))? 
+ .iter() + .map(|v| v.as_str().unwrap_or_default().into()) + .collect(); + Ok(vec) +} + +pub(crate) fn yaml_bool(v: &yaml_rust::Yaml, field: &str) -> Result { + v.as_bool().ok_or(format!("'{field}' must be a boolean")) +} + +pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result { + yaml_string(v, field)?.parse() +} + +pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result { + let ff = yaml_strings(v, field).and_then(|v| v.into_iter().map(|s| s.parse()).collect())?; + Fields::new(ff) +} diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs new file mode 100644 index 000000000000..078deef603b5 --- /dev/null +++ b/src/pipeline/src/etl/processor/regex.rs @@ -0,0 +1,315 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// field_name and prefix with comma separated, like: +// name, new_name +const PATTERNS_NAME: &str = "patterns"; + +pub(crate) const PROCESSOR_REGEX: &str = "regex"; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::etl::field::Fields; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_strings, Field, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, +}; +use crate::etl::value::{Map, Value}; + +lazy_static! 
{ + static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap(); +} + +fn get_regex_group_names(s: &str) -> Vec { + GROUPS_NAME_REGEX + .captures_iter(s) + .filter_map(|c| c.get(1).map(|m| m.as_str().to_string())) + .collect() +} + +#[derive(Debug)] +struct GroupRegex { + origin: String, + regex: Regex, + groups: Vec, +} + +impl std::fmt::Display for GroupRegex { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let groups = self.groups.join(", "); + write!(f, "{}, groups: [{groups}]", self.origin) + } +} + +impl std::str::FromStr for GroupRegex { + type Err = String; + + fn from_str(origin: &str) -> Result { + let groups = get_regex_group_names(origin); + if groups.is_empty() { + return Err(format!("no named group found in regex {origin}")); + } + + let regex = Regex::new(origin).map_err(|e| e.to_string())?; + Ok(GroupRegex { + origin: origin.into(), + regex, + groups, + }) + } +} + +/// only support string value +/// if no value found from a pattern, the target_field will be ignored +#[derive(Debug, Default)] +pub struct RegexProcessor { + fields: Fields, + patterns: Vec, + ignore_missing: bool, +} + +impl RegexProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn try_with_patterns(&mut self, patterns: Vec) -> Result<(), String> { + let mut rs = vec![]; + for pattern in patterns { + let gr = pattern.parse()?; + rs.push(gr); + } + self.patterns = rs; + Ok(()) + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn check(self) -> Result { + if self.fields.is_empty() { + return Err(format!( + "no valid field found in {} processor", + PROCESSOR_REGEX + )); + } + + if self.patterns.is_empty() { + return Err(format!( + "no valid pattern found in {} processor", + PROCESSOR_REGEX + )); + } + + Ok(self) + } + + fn process_field(&self, val: &str, field: &Field, gr: &GroupRegex) -> Result { + let mut map = Map::default(); + + if let Some(captures) = gr.regex.captures(val) { + for group in &gr.groups { + if let Some(capture) = captures.name(group) { + let value = capture.as_str().to_string(); + let prefix = match &field.target_field { + Some(s) => s, + None => &field.field, + }; + + let key = format!("{prefix}_{group}"); + + map.insert(key, Value::String(value)); + } + } + } + + Ok(map) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor { + type Error = String; + + fn try_from(value: &yaml_rust::yaml::Hash) -> Result { + let mut processor = RegexProcessor::default(); + + for (k, v) in value.iter() { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + match key { + FIELD_NAME => { + processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + PATTERNS_NAME => { + processor.try_with_patterns(yaml_strings(v, PATTERNS_NAME)?)?; + } + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + _ => {} + } + } + + processor.check() + } +} + +impl Processor for RegexProcessor { + fn kind(&self) -> &str { + PROCESSOR_REGEX + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn exec_field(&self, val: &Value, field: &Field) -> Result { + match val { + Value::String(val) => { + let mut map = Map::default(); + for gr in &self.patterns { + let m = self.process_field(val, field, gr)?; + map.extend(m); + } + Ok(map) + } + 
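+            // patterns only apply to string values; any other type is an error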
_ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} +#[cfg(test)] +mod tests { + use itertools::Itertools; + + use super::RegexProcessor; + use crate::etl::field::Fields; + use crate::etl::processor::Processor; + use crate::etl::value::{Map, Value}; + + #[test] + fn test_process() { + let mut processor = RegexProcessor::default(); + + let cc = "[c=c,n=US_CA_SANJOSE,o=55155]"; + let cg = "[a=12.34.567.89,b=12345678,c=g,n=US_CA_SANJOSE,o=20940]"; + let co = "[a=987.654.321.09,c=o]"; + let cp = "[c=p,n=US_CA_SANJOSE,o=55155]"; + let cw = "[c=w,n=US_CA_SANJOSE,o=55155]"; + let breadcrumbs = Value::String([cc, cg, co, cp, cw].iter().join(",")); + + let values = [ + ("breadcrumbs", breadcrumbs.clone()), + ("breadcrumbs_parent", Value::String(cc.to_string())), + ("breadcrumbs_edge", Value::String(cg.to_string())), + ("breadcrumbs_origin", Value::String(co.to_string())), + ("breadcrumbs_peer", Value::String(cp.to_string())), + ("breadcrumbs_wrapper", Value::String(cw.to_string())), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v)) + .collect(); + let temporary_map = Map { values }; + + { + // single field (with prefix), multiple patterns + let ff = ["breadcrumbs, breadcrumbs"] + .iter() + .map(|f| f.parse().unwrap()) + .collect(); + processor.with_fields(Fields::new(ff).unwrap()); + + let ccr = "(?\\[[^\\[]*c=c[^\\]]*\\])"; + let cgr = "(?\\[[^\\[]*c=g[^\\]]*\\])"; + let cor = "(?\\[[^\\[]*c=o[^\\]]*\\])"; + let cpr = "(?\\[[^\\[]*c=p[^\\]]*\\])"; + let cwr = "(?\\[[^\\[]*c=w[^\\]]*\\])"; + let patterns = [ccr, cgr, cor, cpr, cwr] + .iter() + .map(|p| p.to_string()) + .collect(); + processor.try_with_patterns(patterns).unwrap(); + + let mut map = Map::default(); + map.insert("breadcrumbs", breadcrumbs.clone()); + let processed_val = processor.exec_map(map).unwrap(); + + assert_eq!(processed_val, Value::Map(temporary_map.clone())); + } + + { + // multiple fields (with prefix), multiple patterns + let ff = [ + "breadcrumbs_parent, parent", + "breadcrumbs_edge, edge", + "breadcrumbs_origin, origin", + "breadcrumbs_peer, peer", + "breadcrumbs_wrapper, wrapper", + ] + .iter() + .map(|f| f.parse().unwrap()) + .collect(); + processor.with_fields(Fields::new(ff).unwrap()); + + let patterns = [ + "a=(?[^,\\]]+)", + "b=(?[^,\\]]+)", + "k=(?[^,\\]]+)", + "l=(?[^,\\]]+)", + "m=(?[^,\\]]+)", + "n=(?[^,\\]]+)", + "o=(?[^,\\]]+)", + ] + .iter() + .map(|p| p.to_string()) + .collect(); + processor.try_with_patterns(patterns).unwrap(); + + let new_values = vec![ + ("edge_ip", Value::String("12.34.567.89".to_string())), + ("edge_request_id", Value::String("12345678".to_string())), + ("edge_geo", Value::String("US_CA_SANJOSE".to_string())), + ("edge_asn", Value::String("20940".to_string())), + ("origin_ip", Value::String("987.654.321.09".to_string())), + ("peer_asn", Value::String("55155".to_string())), + ("peer_geo", Value::String("US_CA_SANJOSE".to_string())), + ("parent_asn", Value::String("55155".to_string())), + ("parent_geo", Value::String("US_CA_SANJOSE".to_string())), + ("wrapper_asn", Value::String("55155".to_string())), + ("wrapper_geo", Value::String("US_CA_SANJOSE".to_string())), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v)) + .collect(); + + let actual_val = processor.exec_map(temporary_map.clone()).unwrap(); + let mut expected_map = temporary_map.clone(); + expected_map.extend(Map { values: new_values }); + + assert_eq!(Value::Map(expected_map), actual_val); + } + } +} diff --git a/src/pipeline/src/etl/processor/urlencoding.rs 
b/src/pipeline/src/etl/processor/urlencoding.rs new file mode 100644 index 000000000000..f9019fd19126 --- /dev/null +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -0,0 +1,177 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use urlencoding::{decode, encode}; + +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + METHOD_NAME, +}; +use crate::etl::value::{Map, Value}; + +pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding"; + +#[derive(Debug, Default)] +enum Method { + #[default] + Decode, + Encode, +} + +impl std::fmt::Display for Method { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Method::Decode => write!(f, "decode"), + Method::Encode => write!(f, "encode"), + } + } +} + +impl std::str::FromStr for Method { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "decode" => Ok(Method::Decode), + "encode" => Ok(Method::Encode), + _ => Err(format!("invalid method: {s}")), + } + } +} + +/// only support string value +#[derive(Debug, Default)] +pub struct UrlEncodingProcessor { + fields: Fields, + method: Method, + ignore_missing: bool, +} + +impl UrlEncodingProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn with_method(&mut self, method: Method) { + self.method = method; + } + + fn process_field(&self, val: &str, field: &Field) -> Result { + let processed = match self.method { + Method::Encode => encode(val).to_string(), + Method::Decode => decode(val).map_err(|e| e.to_string())?.into_owned(), + }; + let val = Value::String(processed); + + let key = match field.target_field { + Some(ref target_field) => target_field, + None => field.get_field(), + }; + + Ok(Map::one(key, val)) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor { + type Error = String; + + fn try_from(value: &yaml_rust::yaml::Hash) -> Result { + let mut processor = UrlEncodingProcessor::default(); + + for (k, v) in value.iter() { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got {k:?}"))?; + match key { + FIELD_NAME => { + processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + } + FIELDS_NAME => { + processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + } + + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + } + + METHOD_NAME => { + let method = yaml_string(v, METHOD_NAME)?; + processor.with_method(method.parse()?); + } + + _ => {} + } + } + + Ok(processor) + } +} + +impl crate::etl::processor::Processor for UrlEncodingProcessor { + fn kind(&self) -> &str { + PROCESSOR_URL_ENCODING + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn exec_field(&self, val: &Value, field: 
&Field) -> Result { + match val { + Value::String(val) => self.process_field(val, field), + _ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} + +#[cfg(test)] +mod tests { + use crate::etl::field::{Field, Fields}; + use crate::etl::processor::urlencoding::UrlEncodingProcessor; + use crate::etl::value::{Map, Value}; + + #[test] + fn test_decode_url() { + let field = "url"; + let ff: Field = field.parse().unwrap(); + + let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]"; + let encoded = "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D"; + + let mut processor = UrlEncodingProcessor::default(); + processor.with_fields(Fields::one(ff.clone())); + + { + let result = processor.process_field(encoded, &ff).unwrap(); + assert_eq!(Map::one(field, Value::String(decoded.into())), result) + } + { + processor.with_method(super::Method::Encode); + let result = processor.process_field(decoded, &ff).unwrap(); + assert_eq!(Map::one(field, Value::String(encoded.into())), result) + } + } +} diff --git a/src/pipeline/src/etl/transform/index.rs b/src/pipeline/src/etl/transform/index.rs new file mode 100644 index 000000000000..b554824f52a3 --- /dev/null +++ b/src/pipeline/src/etl/transform/index.rs @@ -0,0 +1,57 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +const INDEX_TIMESTAMP: &str = "timestamp"; +const INDEX_TAG: &str = "tag"; +const INDEX_FULLTEXT: &str = "fulltext"; + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Index { + Timestamp, + Tag, + Fulltext, +} + +impl std::fmt::Display for Index { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let index = match self { + Index::Timestamp => INDEX_TIMESTAMP, + Index::Tag => INDEX_TAG, + Index::Fulltext => INDEX_FULLTEXT, + }; + + write!(f, "{}", index) + } +} + +impl TryFrom for Index { + type Error = String; + + fn try_from(value: String) -> Result { + Index::try_from(value.as_str()) + } +} + +impl TryFrom<&str> for Index { + type Error = String; + + fn try_from(value: &str) -> Result { + match value { + INDEX_TIMESTAMP => Ok(Index::Timestamp), + INDEX_TAG => Ok(Index::Tag), + INDEX_FULLTEXT => Ok(Index::Fulltext), + _ => Err(format!("unsupported index type: {}", value)), + } + } +} diff --git a/src/pipeline/src/etl/transform/mod.rs b/src/pipeline/src/etl/transform/mod.rs new file mode 100644 index 000000000000..1c1a0f4141de --- /dev/null +++ b/src/pipeline/src/etl/transform/mod.rs @@ -0,0 +1,205 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod index; +pub mod transformer; + +use itertools::Itertools; + +use crate::etl::field::Fields; +use crate::etl::processor::{yaml_field, yaml_fields, yaml_string}; +use crate::etl::transform::index::Index; +use crate::etl::value::Value; + +const TRANSFORM_FIELD: &str = "field"; +const TRANSFORM_FIELDS: &str = "fields"; +const TRANSFORM_TYPE: &str = "type"; +const TRANSFORM_INDEX: &str = "index"; +const TRANSFORM_DEFAULT: &str = "default"; + +pub use transformer::greptime::GreptimeTransformer; +// pub use transformer::noop::NoopTransformer; + +pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static { + type Output; + + fn new(transforms: Transforms) -> Result; + fn transform(&self, val: crate::etl::value::Value) -> Result; +} + +#[derive(Debug, Default, Clone)] +pub struct Transforms { + transforms: Vec, +} + +impl std::fmt::Display for Transforms { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let transforms = self + .transforms + .iter() + .map(|field| field.to_string()) + .join(", "); + + write!(f, "{}", transforms) + } +} + +impl std::ops::Deref for Transforms { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.transforms + } +} + +impl std::ops::DerefMut for Transforms { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.transforms + } +} + +impl TryFrom<&Vec> for Transforms { + type Error = String; + + fn try_from(docs: &Vec) -> Result { + let mut transforms = vec![]; + + for doc in docs { + let transform: Transform = doc + .as_hash() + .ok_or("transform element must be a map".to_string())? + .try_into()?; + transforms.push(transform); + } + + Ok(Transforms { transforms }) + } +} + +/// only field is required +#[derive(Debug, Clone)] +pub struct Transform { + pub fields: Fields, + + pub type_: Value, + + pub default: Option, + + pub index: Option, +} + +impl std::fmt::Display for Transform { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let index = if let Some(index) = &self.index { + format!(", index: {}", index) + } else { + "".to_string() + }; + + let fields = format!("field(s): {}", self.fields); + let type_ = format!("type: {}", self.type_); + + write!(f, "{type_}{index}, {fields}") + } +} + +impl Default for Transform { + fn default() -> Self { + Transform { + fields: Fields::default(), + type_: Value::Null, + default: None, + index: None, + } + } +} + +impl Transform { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn with_type(&mut self, type_: Value) { + self.type_ = type_; + } + + fn try_default(&mut self, default: Value) -> Result<(), String> { + match (&self.type_, &default) { + (Value::Null, _) => Err(format!( + "transform {} type MUST BE set before default {}", + self.fields, &default, + )), + (_, Value::Null) => Ok(()), // if default is not set, then it will be regarded as default null + (_, _) => { + let target = self + .type_ + .parse_str_value(default.to_str_value().as_str())?; + self.default = Some(target); + Ok(()) + } + } + } + + fn with_index(&mut self, index: Index) { + self.index = Some(index); + } + + pub(crate) fn get_default(&self) -> Option<&Value> { + self.default.as_ref() + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for Transform { + type Error = String; + + fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { + let mut transform = Transform::default(); + + let mut default_opt = None; + + for (k, v) in hash { + let key = 
k.as_str().ok_or("key must be a string")?; + match key { + TRANSFORM_FIELD => { + transform.with_fields(Fields::one(yaml_field(v, TRANSFORM_FIELD)?)); + } + + TRANSFORM_FIELDS => { + transform.with_fields(yaml_fields(v, TRANSFORM_FIELDS)?); + } + + TRANSFORM_TYPE => { + let t = yaml_string(v, TRANSFORM_TYPE)?; + transform.with_type(Value::parse_str_type(&t)?); + } + + TRANSFORM_INDEX => { + let index = yaml_string(v, TRANSFORM_INDEX)?; + transform.with_index(index.try_into()?); + } + + TRANSFORM_DEFAULT => { + default_opt = Some(Value::try_from(v)?); + } + _ => {} + } + } + + if let Some(default) = default_opt { + transform.try_default(default)?; + } + + Ok(transform) + } +} diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs new file mode 100644 index 000000000000..48b612e3d53a --- /dev/null +++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs @@ -0,0 +1,310 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use greptime_proto::v1::value::ValueData; +use greptime_proto::v1::{ColumnDataType, ColumnSchema, SemanticType}; + +use crate::etl::transform::index::Index; +use crate::etl::transform::Transform; +use crate::etl::value::{Epoch, Time, Value}; + +impl TryFrom for ValueData { + type Error = String; + + fn try_from(value: Value) -> Result { + match value { + Value::Null => Err("Null type not supported".to_string()), + + Value::Int8(v) => Ok(ValueData::I32Value(v as i32)), + Value::Int16(v) => Ok(ValueData::I32Value(v as i32)), + Value::Int32(v) => Ok(ValueData::I32Value(v)), + Value::Int64(v) => Ok(ValueData::I64Value(v)), + + Value::Uint8(v) => Ok(ValueData::U32Value(v as u32)), + Value::Uint16(v) => Ok(ValueData::U32Value(v as u32)), + Value::Uint32(v) => Ok(ValueData::U32Value(v)), + Value::Uint64(v) => Ok(ValueData::U64Value(v)), + + Value::Float32(v) => Ok(ValueData::F32Value(v)), + Value::Float64(v) => Ok(ValueData::F64Value(v)), + + Value::Boolean(v) => Ok(ValueData::BoolValue(v)), + Value::String(v) => Ok(ValueData::StringValue(v.clone())), + + Value::Time(Time { nanosecond, .. }) => Ok(ValueData::TimeNanosecondValue(nanosecond)), + + Value::Epoch(Epoch::Nanosecond(ns)) => Ok(ValueData::TimestampNanosecondValue(ns)), + Value::Epoch(Epoch::Microsecond(us)) => Ok(ValueData::TimestampMicrosecondValue(us)), + Value::Epoch(Epoch::Millisecond(ms)) => Ok(ValueData::TimestampMillisecondValue(ms)), + Value::Epoch(Epoch::Second(s)) => Ok(ValueData::TimestampSecondValue(s)), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + } + } +} + +// TODO(yuanbohan): add fulltext support in datatype_extension +pub(crate) fn coerce_columns(transform: &Transform) -> Result, String> { + let mut columns = Vec::new(); + + for field in transform.fields.iter() { + let column_name = field.get_target_field().to_string(); + + let datatype = coerce_type(transform)? 
as i32; + + let semantic_type = coerce_semantic_type(transform) as i32; + + let column = ColumnSchema { + column_name, + datatype, + semantic_type, + datatype_extension: None, + }; + columns.push(column); + } + + Ok(columns) +} + +fn coerce_semantic_type(transform: &Transform) -> SemanticType { + match transform.index { + Some(Index::Tag) => SemanticType::Tag, + Some(Index::Timestamp) => SemanticType::Timestamp, + Some(Index::Fulltext) => unimplemented!("Fulltext"), + None => SemanticType::Field, + } +} + +fn coerce_type(transform: &Transform) -> Result { + match transform.type_ { + Value::Int8(_) => Ok(ColumnDataType::Int8), + Value::Int16(_) => Ok(ColumnDataType::Int16), + Value::Int32(_) => Ok(ColumnDataType::Int32), + Value::Int64(_) => Ok(ColumnDataType::Int64), + + Value::Uint8(_) => Ok(ColumnDataType::Uint8), + Value::Uint16(_) => Ok(ColumnDataType::Uint16), + Value::Uint32(_) => Ok(ColumnDataType::Uint32), + Value::Uint64(_) => Ok(ColumnDataType::Uint64), + + Value::Float32(_) => Ok(ColumnDataType::Float32), + Value::Float64(_) => Ok(ColumnDataType::Float64), + + Value::Boolean(_) => Ok(ColumnDataType::Boolean), + Value::String(_) => Ok(ColumnDataType::String), + + Value::Time(_) => Ok(ColumnDataType::TimestampNanosecond), + + Value::Epoch(Epoch::Nanosecond(_)) => Ok(ColumnDataType::TimestampNanosecond), + Value::Epoch(Epoch::Microsecond(_)) => Ok(ColumnDataType::TimestampMicrosecond), + Value::Epoch(Epoch::Millisecond(_)) => Ok(ColumnDataType::TimestampMillisecond), + Value::Epoch(Epoch::Second(_)) => Ok(ColumnDataType::TimestampSecond), + + Value::Array(_) => unimplemented!("Array"), + Value::Map(_) => unimplemented!("Object"), + + Value::Null => Err(format!( + "Null type not supported when to coerce '{}' type", + transform.fields + )), + } +} + +pub(crate) fn coerce_value( + val: &Value, + transform: &Transform, +) -> Result, String> { + match val { + Value::Null => Ok(None), + + Value::Int8(n) => coerce_i64_value(*n as i64, transform), + Value::Int16(n) => coerce_i64_value(*n as i64, transform), + Value::Int32(n) => coerce_i64_value(*n as i64, transform), + Value::Int64(n) => coerce_i64_value(*n, transform), + + Value::Uint8(n) => coerce_u64_value(*n as u64, transform), + Value::Uint16(n) => coerce_u64_value(*n as u64, transform), + Value::Uint32(n) => coerce_u64_value(*n as u64, transform), + Value::Uint64(n) => coerce_u64_value(*n, transform), + + Value::Float32(n) => coerce_f64_value(*n as f64, transform), + Value::Float64(n) => coerce_f64_value(*n, transform), + + Value::Boolean(b) => coerce_bool_value(*b, transform), + Value::String(s) => coerce_string_value(s, transform), + + Value::Time(Time { nanosecond, .. 
}) => { + Ok(Some(ValueData::TimestampNanosecondValue(*nanosecond))) + } + + Value::Epoch(Epoch::Nanosecond(ns)) => Ok(Some(ValueData::TimestampNanosecondValue(*ns))), + Value::Epoch(Epoch::Microsecond(us)) => Ok(Some(ValueData::TimestampMicrosecondValue(*us))), + Value::Epoch(Epoch::Millisecond(ms)) => Ok(Some(ValueData::TimestampMillisecondValue(*ms))), + Value::Epoch(Epoch::Second(s)) => Ok(Some(ValueData::TimestampSecondValue(*s))), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + } +} + +fn coerce_bool_value(b: bool, transform: &Transform) -> Result, String> { + let val = match transform.type_ { + Value::Int8(_) => ValueData::I8Value(b as i32), + Value::Int16(_) => ValueData::I16Value(b as i32), + Value::Int32(_) => ValueData::I32Value(b as i32), + Value::Int64(_) => ValueData::I64Value(b as i64), + + Value::Uint8(_) => ValueData::U8Value(b as u32), + Value::Uint16(_) => ValueData::U16Value(b as u32), + Value::Uint32(_) => ValueData::U32Value(b as u32), + Value::Uint64(_) => ValueData::U64Value(b as u64), + + Value::Float32(_) => ValueData::F32Value(if b { 1.0 } else { 0.0 }), + Value::Float64(_) => ValueData::F64Value(if b { 1.0 } else { 0.0 }), + + Value::Boolean(_) => ValueData::BoolValue(b), + Value::String(_) => ValueData::StringValue(b.to_string()), + + Value::Time(_) => return Err("Boolean type not supported for Time".to_string()), + Value::Epoch(_) => return Err("Boolean type not supported for Epoch".to_string()), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + + Value::Null => return Ok(None), + }; + + Ok(Some(val)) +} + +fn coerce_i64_value(n: i64, transform: &Transform) -> Result, String> { + let val = match transform.type_ { + Value::Int8(_) => ValueData::I8Value(n as i32), + Value::Int16(_) => ValueData::I16Value(n as i32), + Value::Int32(_) => ValueData::I32Value(n as i32), + Value::Int64(_) => ValueData::I64Value(n), + + Value::Uint8(_) => ValueData::U8Value(n as u32), + Value::Uint16(_) => ValueData::U16Value(n as u32), + Value::Uint32(_) => ValueData::U32Value(n as u32), + Value::Uint64(_) => ValueData::U64Value(n as u64), + + Value::Float32(_) => ValueData::F32Value(n as f32), + Value::Float64(_) => ValueData::F64Value(n as f64), + + Value::Boolean(_) => ValueData::BoolValue(n != 0), + Value::String(_) => ValueData::StringValue(n.to_string()), + + Value::Time(_) => return Err("Integer type not supported for Time".to_string()), + Value::Epoch(_) => return Err("Integer type not supported for Epoch".to_string()), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + + Value::Null => return Ok(None), + }; + + Ok(Some(val)) +} + +fn coerce_u64_value(n: u64, transform: &Transform) -> Result, String> { + let val = match transform.type_ { + Value::Int8(_) => ValueData::I8Value(n as i32), + Value::Int16(_) => ValueData::I16Value(n as i32), + Value::Int32(_) => ValueData::I32Value(n as i32), + Value::Int64(_) => ValueData::I64Value(n as i64), + + Value::Uint8(_) => ValueData::U8Value(n as u32), + Value::Uint16(_) => ValueData::U16Value(n as u32), + Value::Uint32(_) => ValueData::U32Value(n as u32), + Value::Uint64(_) => ValueData::U64Value(n), + + Value::Float32(_) => ValueData::F32Value(n as f32), + Value::Float64(_) => ValueData::F64Value(n as f64), + + Value::Boolean(_) => ValueData::BoolValue(n != 0), + Value::String(_) => 
ValueData::StringValue(n.to_string()), + + Value::Time(_) => return Err("Integer type not supported for Time".to_string()), + Value::Epoch(_) => return Err("Integer type not supported for Epoch".to_string()), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + + Value::Null => return Ok(None), + }; + + Ok(Some(val)) +} + +fn coerce_f64_value(n: f64, transform: &Transform) -> Result, String> { + let val = match transform.type_ { + Value::Int8(_) => ValueData::I8Value(n as i32), + Value::Int16(_) => ValueData::I16Value(n as i32), + Value::Int32(_) => ValueData::I32Value(n as i32), + Value::Int64(_) => ValueData::I64Value(n as i64), + + Value::Uint8(_) => ValueData::U8Value(n as u32), + Value::Uint16(_) => ValueData::U16Value(n as u32), + Value::Uint32(_) => ValueData::U32Value(n as u32), + Value::Uint64(_) => ValueData::U64Value(n as u64), + + Value::Float32(_) => ValueData::F32Value(n as f32), + Value::Float64(_) => ValueData::F64Value(n), + + Value::Boolean(_) => ValueData::BoolValue(n != 0.0), + Value::String(_) => ValueData::StringValue(n.to_string()), + + Value::Time(_) => return Err("Float type not supported for Time".to_string()), + Value::Epoch(_) => return Err("Float type not supported for Epoch".to_string()), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + + Value::Null => return Ok(None), + }; + + Ok(Some(val)) +} + +fn coerce_string_value(s: &str, transform: &Transform) -> Result, String> { + let val = match transform.type_ { + Value::Int8(_) => ValueData::I8Value(s.parse::().map_err(|e| e.to_string())?), + Value::Int16(_) => ValueData::I16Value(s.parse::().map_err(|e| e.to_string())?), + Value::Int32(_) => ValueData::I32Value(s.parse::().map_err(|e| e.to_string())?), + Value::Int64(_) => ValueData::I64Value(s.parse::().map_err(|e| e.to_string())?), + + Value::Uint8(_) => ValueData::U8Value(s.parse::().map_err(|e| e.to_string())?), + Value::Uint16(_) => ValueData::U16Value(s.parse::().map_err(|e| e.to_string())?), + Value::Uint32(_) => ValueData::U32Value(s.parse::().map_err(|e| e.to_string())?), + Value::Uint64(_) => ValueData::U64Value(s.parse::().map_err(|e| e.to_string())?), + + Value::Float32(_) => ValueData::F32Value(s.parse::().map_err(|e| e.to_string())?), + Value::Float64(_) => ValueData::F64Value(s.parse::().map_err(|e| e.to_string())?), + + Value::Boolean(_) => ValueData::BoolValue(s.parse::().map_err(|e| e.to_string())?), + Value::String(_) => ValueData::StringValue(s.to_string()), + + Value::Time(_) => return Err("String type not supported for Time".to_string()), + Value::Epoch(_) => return Err("String type not supported for Epoch".to_string()), + + Value::Array(_) => unimplemented!("Array type not supported"), + Value::Map(_) => unimplemented!("Object type not supported"), + + Value::Null => return Ok(None), + }; + + Ok(Some(val)) +} diff --git a/src/pipeline/src/etl/transform/transformer/greptime/mod.rs b/src/pipeline/src/etl/transform/transformer/greptime/mod.rs new file mode 100644 index 000000000000..933f621d6a14 --- /dev/null +++ b/src/pipeline/src/etl/transform/transformer/greptime/mod.rs @@ -0,0 +1,172 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod coerce; + +use std::collections::HashSet; + +use coerce::{coerce_columns, coerce_value}; +use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; +use itertools::Itertools; + +use crate::etl::field::{Field, Fields}; +use crate::etl::transform::index::Index; +use crate::etl::transform::{Transform, Transformer, Transforms}; +use crate::etl::value::{Array, Epoch, Map, Value}; + +const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; + +/// fields not in the columns will be discarded +/// to prevent automatic column creation in GreptimeDB +#[derive(Debug, Clone)] +pub struct GreptimeTransformer { + transforms: Transforms, +} + +impl GreptimeTransformer { + fn default_greptime_timestamp_column() -> Transform { + let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); + let type_ = Value::Epoch(Epoch::Nanosecond(ns)); + let default = Some(type_.clone()); + let field = Field::new(DEFAULT_GREPTIME_TIMESTAMP_COLUMN); + let fields = Fields::new(vec![field]).unwrap(); + + Transform { + fields, + type_, + default, + index: Some(Index::Timestamp), + } + } + + fn schemas(&self) -> Result, String> { + let mut schema = vec![]; + for transform in self.transforms.iter() { + schema.extend(coerce_columns(transform)?); + } + Ok(schema) + } + + fn transform_map(&self, map: &Map) -> Result { + let mut values = vec![]; + + for transform in self.transforms.iter() { + for field in transform.fields.iter() { + let value_data = match map.get(field.get_field()) { + Some(val) => coerce_value(val, transform)?, + None if transform.get_default().is_some() => { + coerce_value(transform.get_default().unwrap(), transform)? 
+ } + None => None, + }; + values.push(GreptimeValue { value_data }); + } + } + + Ok(Row { values }) + } + + fn transform_array(&self, arr: &Array) -> Result, String> { + let mut rows = vec![]; + for v in arr.iter() { + match v { + Value::Map(map) => { + let row = self.transform_map(map)?; + rows.push(row); + } + _ => return Err(format!("Expected map, found: {v:?}")), + } + } + Ok(rows) + } +} + +impl std::fmt::Display for GreptimeTransformer { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + writeln!(f, "GreptimeTransformer.\nColumns: {}", self.transforms) + } +} + +impl Transformer for GreptimeTransformer { + type Output = Rows; + + fn new(mut transforms: Transforms) -> Result { + if transforms.is_empty() { + return Err("transform cannot be empty".to_string()); + } + + let mut column_names_set = HashSet::new(); + let mut timestamp_columns = vec![]; + + for transform in transforms.iter() { + let target_fields_set = transform + .fields + .iter() + .map(|f| f.get_target_field()) + .collect::>(); + + let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect(); + if !intersections.is_empty() { + let duplicates = intersections.iter().join(","); + return Err(format!( + "column name must be unique, but got duplicated: {duplicates}" + )); + } + + column_names_set.extend(target_fields_set); + + if let Some(idx) = transform.index { + if idx == Index::Timestamp { + match transform.fields.len() { + 1 => timestamp_columns.push(transform.fields.first().unwrap().get_field()), + _ => return Err(format!( + "Illegal to set multiple timestamp Index columns, please set only one: {}", + transform.fields.get_target_fields().join(", ") + )), + } + } + } + } + + match timestamp_columns.len() { + 0 => { + transforms.push(GreptimeTransformer::default_greptime_timestamp_column()); + Ok(GreptimeTransformer { transforms }) + } + 1 => Ok(GreptimeTransformer { transforms }), + _ => { + let columns: String = timestamp_columns.iter().map(|s| s.to_string()).join(", "); + let count = timestamp_columns.len(); + Err( + format!("transform must have exactly one field specified as timestamp Index, but got {count}: {columns}") + ) + } + } + } + + fn transform(&self, value: Value) -> Result { + let schema = self.schemas()?; + match value { + Value::Map(map) => { + let rows = vec![self.transform_map(&map)?]; + Ok(Rows { schema, rows }) + } + Value::Array(arr) => { + let rows = self.transform_array(&arr)?; + Ok(Rows { schema, rows }) + } + _ => Err(format!("Expected map or array, found: {}", value)), + } + } +} diff --git a/src/pipeline/src/etl/transform/transformer/mod.rs b/src/pipeline/src/etl/transform/transformer/mod.rs new file mode 100644 index 000000000000..173aac61bfe5 --- /dev/null +++ b/src/pipeline/src/etl/transform/transformer/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +pub mod greptime; +pub mod noop; diff --git a/src/pipeline/src/etl/transform/transformer/noop.rs b/src/pipeline/src/etl/transform/transformer/noop.rs new file mode 100644 index 000000000000..6bd7a208c981 --- /dev/null +++ b/src/pipeline/src/etl/transform/transformer/noop.rs @@ -0,0 +1,36 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::etl::transform::{Transformer, Transforms}; +use crate::etl::value::Value; + +pub struct NoopTransformer; + +impl std::fmt::Display for NoopTransformer { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "NoopTransformer") + } +} + +impl Transformer for NoopTransformer { + type Output = Value; + + fn new(_transforms: Transforms) -> Result { + Ok(NoopTransformer) + } + + fn transform(&self, val: Value) -> Result { + Ok(val) + } +} diff --git a/src/pipeline/src/etl/value/array.rs b/src/pipeline/src/etl/value/array.rs new file mode 100644 index 000000000000..617d9beed348 --- /dev/null +++ b/src/pipeline/src/etl/value/array.rs @@ -0,0 +1,56 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::etl::value::Value; + +#[derive(Debug, Clone, PartialEq, Default)] +pub struct Array { + pub values: Vec, +} + +impl Array { + pub fn new() -> Self { + Array { values: vec![] } + } +} + +impl std::fmt::Display for Array { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let values = self + .values + .iter() + .map(|v| v.to_string()) + .collect::>() + .join(", "); + write!(f, "[{}]", values) + } +} + +impl std::ops::Deref for Array { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.values + } +} + +impl IntoIterator for Array { + type Item = Value; + + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.values.into_iter() + } +} diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs new file mode 100644 index 000000000000..3b03ab311fb0 --- /dev/null +++ b/src/pipeline/src/etl/value/map.rs @@ -0,0 +1,58 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use crate::etl::value::Value; + +#[derive(Debug, Clone, PartialEq, Default)] +pub struct Map { + pub values: HashMap, +} + +impl Map { + pub fn one(key: impl Into, value: Value) -> Map { + let mut map = Map::default(); + map.insert(key, value); + map + } + + pub fn insert(&mut self, key: impl Into, value: Value) { + self.values.insert(key.into(), value); + } + + pub fn extend(&mut self, Map { values }: Map) { + self.values.extend(values); + } +} + +impl std::ops::Deref for Map { + type Target = HashMap; + + fn deref(&self) -> &Self::Target { + &self.values + } +} + +impl std::fmt::Display for Map { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let values = self + .values + .iter() + .map(|(k, v)| format!("{}: {}", k, v)) + .collect::>() + .join(", "); + write!(f, "{{{}}}", values) + } +} diff --git a/src/pipeline/src/etl/value/mod.rs b/src/pipeline/src/etl/value/mod.rs new file mode 100644 index 000000000000..a9d7c34feba1 --- /dev/null +++ b/src/pipeline/src/etl/value/mod.rs @@ -0,0 +1,303 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
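// --- Illustrative aside (not part of this diff): a minimal sketch, assuming the
// Map/Array wrappers and Value variants introduced by the new value modules above,
// of how pipeline rows are assembled and printed through their Display impls.
use crate::etl::value::{Array, Map, Value};

fn sketch_value_containers() {
    // Map::one builds a single-entry map; insert/extend grow it in place.
    let mut row = Map::one("status", Value::Uint32(200));
    row.insert("path", Value::String("/index.html".to_string()));
    row.extend(Map::one("bytes", Value::Uint64(4995)));

    // Array derefs to Vec<Value>, so the usual iteration works.
    let batch = Array { values: vec![Value::Map(row)] };
    for v in batch.iter() {
        // e.g. "{status: uint32(200), path: string(/index.html), bytes: uint64(4995)}"
        // (HashMap iteration order is unspecified)
        println!("{v}");
    }
}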
+ +pub mod array; +pub mod map; +pub mod time; + +pub use std::collections::HashMap; + +pub use array::Array; +pub use map::Map; +pub use time::{Epoch, Time}; + +/// Value can be used as type +/// acts as value: the enclosed value is the actual value +/// acts as type: the enclosed value is the default value +#[derive(Debug, Clone, PartialEq)] +pub enum Value { + // as value: null + // as type: no type specified + Null, + + Int8(i8), + Int16(i16), + Int32(i32), + Int64(i64), + + Uint8(u8), + Uint16(u16), + Uint32(u32), + Uint64(u64), + + Float32(f32), + Float64(f64), + + Boolean(bool), + String(String), + + Time(Time), + Epoch(Epoch), + + Array(Array), + Map(Map), +} + +impl Value { + pub fn is_null(&self) -> bool { + matches!(self, Value::Null) + } + + pub fn parse_str_type(t: &str) -> Result { + let mut parts = t.splitn(2, ','); + let head = parts.next().unwrap_or_default(); + let tail = parts.next().map(|s| s.trim().to_string()); + match head.to_lowercase().as_str() { + "int8" => Ok(Value::Int8(0)), + "int16" => Ok(Value::Int16(0)), + "int32" => Ok(Value::Int32(0)), + "int64" => Ok(Value::Int64(0)), + + "uint8" => Ok(Value::Uint8(0)), + "uint16" => Ok(Value::Uint16(0)), + "uint32" => Ok(Value::Uint32(0)), + "uint64" => Ok(Value::Uint64(0)), + + "float32" => Ok(Value::Float32(0.0)), + "float64" => Ok(Value::Float64(0.0)), + + "boolean" => Ok(Value::Boolean(false)), + "string" => Ok(Value::String("".to_string())), + + "time" => Ok(Value::Time(Time::default())), + "epoch" => match tail { + Some(resolution) if !resolution.is_empty() => match resolution.as_str() { + time::NANOSECOND_RESOLUTION | time::NANO_RESOLUTION | time::NS_RESOLUTION => { + Ok(Value::Epoch(Epoch::Nanosecond(0))) + } + time::MICROSECOND_RESOLUTION | time::MICRO_RESOLUTION | time::US_RESOLUTION => { + Ok(Value::Epoch(Epoch::Microsecond(0))) + } + time::MILLISECOND_RESOLUTION | time::MILLI_RESOLUTION | time::MS_RESOLUTION => { + Ok(Value::Epoch(Epoch::Millisecond(0))) + } + time::SECOND_RESOLUTION | time::SEC_RESOLUTION | time::S_RESOLUTION => { + Ok(Value::Epoch(Epoch::Second(0))) + } + _ => Err(format!( + "invalid resolution: '{resolution}'. Available resolutions: {}", + time::VALID_RESOLUTIONS.join(",") + )), + }, + _ => Err(format!( + "resolution MUST BE set for epoch type: '{t}'. 
Available resolutions: {}", + time::VALID_RESOLUTIONS.join(", ") + )), + }, + + "array" => Ok(Value::Array(Array::default())), + "map" => Ok(Value::Map(Map::default())), + + _ => Err(format!("failed to parse type: '{t}'")), + } + } + + /// only support string, bool, number, null + pub fn parse_str_value(&self, v: &str) -> Result { + match self { + Value::Int8(_) => v + .parse::() + .map(Value::Int8) + .map_err(|e| format!("failed to parse int8: {}", e)), + Value::Int16(_) => v + .parse::() + .map(Value::Int16) + .map_err(|e| format!("failed to parse int16: {}", e)), + Value::Int32(_) => v + .parse::() + .map(Value::Int32) + .map_err(|e| format!("failed to parse int32: {}", e)), + Value::Int64(_) => v + .parse::() + .map(Value::Int64) + .map_err(|e| format!("failed to parse int64: {}", e)), + + Value::Uint8(_) => v + .parse::() + .map(Value::Uint8) + .map_err(|e| format!("failed to parse uint8: {}", e)), + Value::Uint16(_) => v + .parse::() + .map(Value::Uint16) + .map_err(|e| format!("failed to parse uint16: {}", e)), + Value::Uint32(_) => v + .parse::() + .map(Value::Uint32) + .map_err(|e| format!("failed to parse uint32: {}", e)), + Value::Uint64(_) => v + .parse::() + .map(Value::Uint64) + .map_err(|e| format!("failed to parse uint64: {}", e)), + + Value::Float32(_) => v + .parse::() + .map(Value::Float32) + .map_err(|e| format!("failed to parse float32: {}", e)), + Value::Float64(_) => v + .parse::() + .map(Value::Float64) + .map_err(|e| format!("failed to parse float64: {}", e)), + + Value::Boolean(_) => v + .parse::() + .map(Value::Boolean) + .map_err(|e| format!("failed to parse bool: {}", e)), + Value::String(_) => Ok(Value::String(v.to_string())), + + Value::Null => Ok(Value::Null), + + _ => Err(format!("default value not unsupported for type {}", self)), + } + } + + /// only support string, bool, number, null + pub fn to_str_value(&self) -> String { + match self { + Value::Int8(v) => format!("{}", v), + Value::Int16(v) => format!("{}", v), + Value::Int32(v) => format!("{}", v), + Value::Int64(v) => format!("{}", v), + + Value::Uint8(v) => format!("{}", v), + Value::Uint16(v) => format!("{}", v), + Value::Uint32(v) => format!("{}", v), + Value::Uint64(v) => format!("{}", v), + + Value::Float32(v) => format!("{}", v), + Value::Float64(v) => format!("{}", v), + + Value::Boolean(v) => format!("{}", v), + Value::String(v) => v.to_string(), + + v => v.to_string(), + } + } +} + +impl std::fmt::Display for Value { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let str = match self { + Value::Null => "null".to_string(), + + Value::Int8(v) => format!("int8({})", v), + Value::Int16(v) => format!("int16({})", v), + Value::Int32(v) => format!("int32({})", v), + Value::Int64(v) => format!("int64({})", v), + + Value::Uint8(v) => format!("uint8({})", v), + Value::Uint16(v) => format!("uint16({})", v), + Value::Uint32(v) => format!("uint32({})", v), + Value::Uint64(v) => format!("uint64({})", v), + + Value::Float32(v) => format!("float32({})", v), + Value::Float64(v) => format!("float64({})", v), + + Value::Boolean(v) => format!("boolean({})", v), + Value::String(v) => format!("string({})", v), + + Value::Time(v) => format!("time({})", v), + Value::Epoch(v) => format!("epoch({})", v), + + Value::Array(v) => format!("{}", v), + Value::Map(v) => format!("{}", v), + }; + + write!(f, "{}", str) + } +} + +impl TryFrom for Value { + type Error = String; + + fn try_from(v: serde_json::Value) -> Result { + match v { + serde_json::Value::Null => Ok(Value::Null), + 
serde_json::Value::Bool(v) => Ok(Value::Boolean(v)), + serde_json::Value::Number(v) => { + if let Some(v) = v.as_i64() { + Ok(Value::Int64(v)) + } else if let Some(v) = v.as_u64() { + Ok(Value::Uint64(v)) + } else if let Some(v) = v.as_f64() { + Ok(Value::Float64(v)) + } else { + Err(format!("unsupported number type: {}", v)) + } + } + serde_json::Value::String(v) => Ok(Value::String(v)), + serde_json::Value::Array(v) => { + let mut values = vec![]; + for v in v { + values.push(Value::try_from(v)?); + } + Ok(Value::Array(Array { values })) + } + serde_json::Value::Object(v) => { + let mut values = HashMap::new(); + for (k, v) in v { + values.insert(k, Value::try_from(v)?); + } + Ok(Value::Map(Map { values })) + } + } + } +} + +impl TryFrom<&yaml_rust::Yaml> for Value { + type Error = String; + + fn try_from(v: &yaml_rust::Yaml) -> Result { + match v { + yaml_rust::Yaml::Null => Ok(Value::Null), + yaml_rust::Yaml::Boolean(v) => Ok(Value::Boolean(*v)), + yaml_rust::Yaml::Integer(v) => Ok(Value::Int64(*v)), + yaml_rust::Yaml::Real(v) => { + if let Ok(v) = v.parse() { + Ok(Value::Float64(v)) + } else { + Err(format!("failed to parse float64: {}", v)) + } + } + yaml_rust::Yaml::String(v) => Ok(Value::String(v.to_string())), + yaml_rust::Yaml::Array(arr) => { + let mut values = vec![]; + for v in arr { + values.push(Value::try_from(v)?); + } + Ok(Value::Array(Array { values })) + } + yaml_rust::Yaml::Hash(v) => { + let mut values = HashMap::new(); + for (k, v) in v { + let key = k + .as_str() + .ok_or(format!("key in Hash must be a string, but got {v:?}"))?; + values.insert(key.to_string(), Value::try_from(v)?); + } + Ok(Value::Map(Map { values })) + } + _ => Err(format!("unsupported yaml type: {v:?}")), + } + } +} diff --git a/src/pipeline/src/etl/value/time.rs b/src/pipeline/src/etl/value/time.rs new file mode 100644 index 000000000000..5dbc4d4e092e --- /dev/null +++ b/src/pipeline/src/etl/value/time.rs @@ -0,0 +1,187 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_telemetry::error; + +#[derive(Debug, Clone, PartialEq)] +pub struct Time { + pub value: String, + pub nanosecond: i64, + pub format: Option, + pub timezone: Option, + // TODO(yuanbohan): support locale + // pub locale: Option, +} + +impl Time { + pub(crate) fn new(v: impl Into, nanosecond: i64) -> Self { + let value = v.into(); + Time { + value, + nanosecond, + format: None, + timezone: None, + } + } + + pub(crate) fn with_format(&mut self, format: impl Into) { + self.format = Some(format.into()); + } + + pub(crate) fn with_timezone(&mut self, timezone: Option) { + self.timezone = timezone; + } + + pub(crate) fn timestamp_nanos(&self) -> i64 { + self.nanosecond + } + + pub(crate) fn timestamp_micros(&self) -> i64 { + self.nanosecond / 1_000 + } + + pub(crate) fn timestamp_millis(&self) -> i64 { + self.nanosecond / 1_000_000 + } + + pub(crate) fn timestamp(&self) -> i64 { + self.nanosecond / 1_000_000_000 + } +} + +impl Default for Time { + fn default() -> Self { + let dt = chrono::Utc::now(); + let v = dt.to_rfc3339(); + let ns = match dt.timestamp_nanos_opt() { + Some(ns) => ns, + None => { + error!("failed to get nanosecond from timestamp, use 0 instead"); + 0 + } + }; + Time::new(v, ns) + } +} + +impl std::fmt::Display for Time { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let format = if let Some(format) = &self.format { + format!(", format: {}", format) + } else { + "".to_string() + }; + + let timezone = if let Some(timezone) = &self.timezone { + format!(", timezone: {}", timezone) + } else { + "".to_string() + }; + + write!(f, "{}, format: {}{}", self.value, format, timezone) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum Epoch { + Nanosecond(i64), + Microsecond(i64), + Millisecond(i64), + Second(i64), +} + +pub(crate) const NANOSECOND_RESOLUTION: &str = "nanosecond"; +pub(crate) const NANO_RESOLUTION: &str = "nano"; +pub(crate) const NS_RESOLUTION: &str = "ns"; +pub(crate) const MICROSECOND_RESOLUTION: &str = "microsecond"; +pub(crate) const MICRO_RESOLUTION: &str = "micro"; +pub(crate) const US_RESOLUTION: &str = "us"; +pub(crate) const MILLISECOND_RESOLUTION: &str = "millisecond"; +pub(crate) const MILLI_RESOLUTION: &str = "milli"; +pub(crate) const MS_RESOLUTION: &str = "ms"; +pub(crate) const SECOND_RESOLUTION: &str = "second"; +pub(crate) const SEC_RESOLUTION: &str = "sec"; +pub(crate) const S_RESOLUTION: &str = "s"; + +pub(crate) const VALID_RESOLUTIONS: [&str; 12] = [ + NANOSECOND_RESOLUTION, + NANO_RESOLUTION, + NS_RESOLUTION, + MICROSECOND_RESOLUTION, + MICRO_RESOLUTION, + US_RESOLUTION, + MILLISECOND_RESOLUTION, + MILLI_RESOLUTION, + MS_RESOLUTION, + SECOND_RESOLUTION, + SEC_RESOLUTION, + S_RESOLUTION, +]; + +impl Epoch { + pub(crate) fn timestamp_nanos(&self) -> i64 { + match self { + Epoch::Nanosecond(v) => *v, + Epoch::Microsecond(v) => *v * 1_000, + Epoch::Millisecond(v) => *v * 1_000_000, + Epoch::Second(v) => *v * 1_000_000_000, + } + } + + pub(crate) fn timestamp_micros(&self) -> i64 { + match self { + Epoch::Nanosecond(v) => *v / 1_000, + Epoch::Microsecond(v) => *v, + Epoch::Millisecond(v) => *v * 1_000, + Epoch::Second(v) => *v * 1_000_000, + } + } + + pub(crate) fn timestamp_millis(&self) -> i64 { + match self { + Epoch::Nanosecond(v) => *v / 1_000_000, + Epoch::Microsecond(v) => *v / 1_000, + Epoch::Millisecond(v) => *v, + Epoch::Second(v) => *v * 1_000, + } + } + + pub(crate) fn timestamp(&self) -> i64 { + match self { + Epoch::Nanosecond(v) => *v / 1_000_000_000, + Epoch::Microsecond(v) => *v / 1_000_000, + 
Epoch::Millisecond(v) => *v / 1_000, + Epoch::Second(v) => *v, + } + } +} + +impl Default for Epoch { + fn default() -> Self { + Epoch::Nanosecond(chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0)) + } +} + +impl std::fmt::Display for Epoch { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let (value, resolution) = match self { + Epoch::Nanosecond(v) => (v, NANOSECOND_RESOLUTION), + Epoch::Microsecond(v) => (v, MICROSECOND_RESOLUTION), + Epoch::Millisecond(v) => (v, MILLISECOND_RESOLUTION), + Epoch::Second(v) => (v, SECOND_RESOLUTION), + }; + + write!(f, "{}, resolution: {}", value, resolution) + } +} diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index f5acf8d86af1..57f7bf903a4a 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -1,6 +1,7 @@ -pub mod error; -pub mod table; +mod etl; +mod mng; -pub use pipeline::transform::GreptimeTransformer; -pub use pipeline::value::Value; -pub use pipeline::Pipeline; +pub use etl::transform::GreptimeTransformer; +pub use etl::value::Value; +pub use etl::{parse, Content, Pipeline}; +pub use mng::{error, table}; diff --git a/src/pipeline/src/error.rs b/src/pipeline/src/mng/error.rs similarity index 100% rename from src/pipeline/src/error.rs rename to src/pipeline/src/mng/error.rs diff --git a/src/pipeline/src/mng/mod.rs b/src/pipeline/src/mng/mod.rs new file mode 100644 index 000000000000..91ca63d5bac0 --- /dev/null +++ b/src/pipeline/src/mng/mod.rs @@ -0,0 +1,2 @@ +pub mod error; +pub mod table; diff --git a/src/pipeline/src/table.rs b/src/pipeline/src/mng/table.rs similarity index 99% rename from src/pipeline/src/table.rs rename to src/pipeline/src/mng/table.rs index 681301c13650..34dbc81bb4f5 100644 --- a/src/pipeline/src/table.rs +++ b/src/pipeline/src/mng/table.rs @@ -20,8 +20,6 @@ use datatypes::prelude::ScalarVector; use datatypes::vectors::{StringVector, Vector}; use operator::insert::InserterRef; use operator::statement::StatementExecutorRef; -use pipeline::transform::GreptimeTransformer; -use pipeline::{parse, Content, Pipeline}; use query::plan::LogicalPlan; use query::QueryEngineRef; use session::context::{QueryContextBuilder, QueryContextRef}; @@ -34,6 +32,8 @@ use crate::error::{ BuildDfLogicalPlanSnafu, CastTypeSnafu, CollectRecordsSnafu, ExecuteInternalStatementSnafu, InsertPipelineSnafu, ParsePipelineSnafu, PipelineNotFoundSnafu, Result, }; +use crate::etl::transform::GreptimeTransformer; +use crate::etl::{parse, Content, Pipeline}; pub type PipelineTableRef = Arc; diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs new file mode 100644 index 000000000000..0a597f188fd9 --- /dev/null +++ b/src/pipeline/tests/pipeline.rs @@ -0,0 +1,461 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
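// --- Illustrative aside (not part of this diff): the Epoch conversions defined
// in time.rs above scale up toward finer units and truncate toward coarser ones.
// The timestamp helpers are pub(crate), so this reads as an in-crate unit-test
// style sketch rather than an external example.
use crate::etl::value::Epoch;

fn sketch_epoch_conversions() {
    let e = Epoch::Second(1_573_840_000);
    assert_eq!(e.timestamp(), 1_573_840_000);
    assert_eq!(e.timestamp_millis(), 1_573_840_000_000);
    assert_eq!(e.timestamp_nanos(), 1_573_840_000_000_000_000);

    // Going the other way truncates: 1_500 ms is one whole second.
    assert_eq!(Epoch::Millisecond(1_500).timestamp(), 1);
}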
+ +use common_telemetry::tracing::info; +use greptime_proto::v1::value::ValueData::{ + BoolValue, F64Value, StringValue, TimestampSecondValue, U32Value, U64Value, U8Value, +}; +use greptime_proto::v1::Value as GreptimeValue; +use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value}; + +// use pipeline::transform::GreptimeTransformer; +// use pipeline::value::Value; +// use pipeline::{parse, Content, Pipeline}; + +#[test] +fn main() { + let input_value_str = r#" + [ + { + "version": 1, + "streamId": "12345", + "cp": "123456", + "reqId": "1239f220", + "reqTimeSec": "1573840000", + "bytes": "4995", + "cliIP": "128.147.28.68", + "statusCode": "206", + "proto": "HTTPS", + "reqHost": "test.hostname.net", + "reqMethod": "GET", + "reqPath": "/path1/path2/file.ext", + "reqPort": "443", + "rspContentLen": "5000", + "rspContentType": "text/html", + "UA": "Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_14_3%29", + "tlsOverheadTimeMSec": "0", + "tlsVersion": "TLSv1", + "objSize": "484", + "uncompressedSize": "484", + "overheadBytes": "232", + "totalBytes": "0", + "queryStr": "cmcd=//1.0@V/bl=21600,br=1426,cid=%22akam-email%22,d=6006,mtp=11100,ot=m,sf=h,sid=%229f36f5c9-d6a2-497b-8c73-4b8f694eab749f36f5c9-d6a2-497b-8c73%22,tb=1426,dl=18500,nor=%22../300kbps/track.m4v%22,nrr=%2212323-48763%22,su,bs,rtp=12000,pr=1.08,sf=d,st=v%22", + "breadcrumbs": "//BC/%5Ba=23.33.41.20,c=g,k=0,l=1%5D", + "accLang": "en-US", + "cookie": "cookie-content", + "range": "37334-42356", + "referer": "https%3A%2F%2Ftest.referrer.net%2Fen-US%2Fdocs%2FWeb%2Ftest", + "xForwardedFor": "8.47.28.38", + "maxAgeSec": "3600", + "reqEndTimeMSec": "3", + "errorCode": "ERR_ACCESS_DENIED|fwd_acl", + "turnAroundTimeMSec": "11", + "transferTimeMSec": "125", + "dnsLookupTimeMSec": "50", + "lastByte": "1", + "edgeIP": "23.50.51.173", + "country": "IN", + "state": "Virginia", + "city": "HERNDON", + "serverCountry": "SG", + "billingRegion": "8", + "cacheStatus": "1", + "securityRules": "ULnR_28976|3900000:3900001:3900005:3900006:BOT-ANOMALY-HEADER|", + "ewUsageInfo": "//4380/4.0/1/-/0/4/#1,2\\//4380/4.0/4/-/0/4/#0,0\\//4380/4.0/5/-/1/1/#0,0", + "ewExecutionInfo": "c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200", + "customField": "any-custom-value" + } + ] +"#; + let input_value: Value = serde_json::from_str::(input_value_str) + .expect("failed to parse input value") + .try_into() + .expect("failed to convert input value"); + + let pipeline_yaml = r#" +--- +description: Pipeline for Akamai DataStream2 Log + +processors: + - urlencoding: + fields: + - breadcrumbs + - UA + - referer + - queryStr + method: decode + ignore_missing: true + - epoch: + field: reqTimeSec + resolution: second + ignore_missing: true + - regex: + field: breadcrumbs + patterns: + - "(?\\[[^\\[]*c=c[^\\]]*\\])" + - "(?\\[[^\\[]*c=g[^\\]]*\\])" + - "(?\\[[^\\[]*c=o[^\\]]*\\])" + - "(?\\[[^\\[]*c=p[^\\]]*\\])" + - "(?\\[[^\\[]*c=w[^\\]]*\\])" + ignore_missing: true + - regex: + fields: + - breadcrumbs_parent + - breadcrumbs_edge + - breadcrumbs_origin + - breadcrumbs_peer + - breadcrumbs_cloud_wrapper + ignore_missing: true + patterns: + - "a=(?[^,\\]]+)" + - "b=(?[^,\\]]+)" + - "k=(?[^,\\]]+)" + - "l=(?[^,\\]]+)" + - "m=(?[^,\\]]+)" + - "n=(?[^,\\]]+)" + - "o=(?[^,\\]]+)" + - regex: + field: queryStr, cmcd + patterns: + - "(?i)CMCD=//(?[\\d\\.]+)@V/(?.+$)" + ignore_missing: true + - cmcd: + field: cmcd_data, cmcd + ignore_missing: true + +transform: + - fields: + - breadcrumbs + - referer + - queryStr, query_str + - 
customField, custom_field + - reqId, req_id + - city + - state + - country + - securityRules, security_rules + - ewUsageInfo, ew_usage_info + - ewExecutionInfo, ew_execution_info + - errorCode, error_code + - xForwardedFor, x_forwarded_for + - range + - accLang, acc_lang + - reqMethod, req_method + - reqHost, req_host + - proto + - cliIP, cli_ip + - rspContentType, rsp_content_type + - tlsVersion, tls_version + type: string + - fields: + - version + - cacheStatus, cache_status + - lastByte, last_byte + type: uint8 + - fields: + - streamId, stream_id + - billingRegion, billing_region + - dnsLookupTimeMSec, dns_lookup_time_msec + - transferTimeMSec, transfer_time_msec + - turnAroundTimeMSec, turn_around_time_msec + - reqEndTimeMSec, req_end_time_msec + - maxAgeSec, max_age_sec + - reqPort, req_port + - statusCode, status_code + - cp + - tlsOverheadTimeMSec, tls_overhead_time_msec + type: uint32 + - fields: + - bytes + - rspContentLen, rsp_content_len + - objSize, obj_size + - uncompressedSize, uncompressed_size + - overheadBytes, overhead_bytes + - totalBytes, total_bytes + type: uint64 + - fields: + - UA, user_agent + - cookie + - reqPath, req_path + type: string + # index: fulltext + - field: reqTimeSec, req_time_sec + # epoch time is special, the resolution MUST BE specified + type: epoch, second + index: timestamp + + # the following is from cmcd + - fields: + - cmcd_version + - cmcd_cid, cmcd_content_id + - cmcd_nor, cmcd_next_object_requests + - cmcd_nrr, cmcd_next_range_request + - cmcd_ot, cmcd_object_type + - cmcd_sf, cmcd_streaming_format + - cmcd_sid, cmcd_session_id + - cmcd_st, cmcd_stream_type + - cmcd_v + type: string + - fields: + - cmcd_br, cmcd_encoded_bitrate + - cmcd_bl, cmcd_buffer_length + - cmcd_d, cmcd_object_duration + - cmcd_dl, cmcd_deadline + - cmcd_mtp, cmcd_measured_throughput + - cmcd_rtp, cmcd_requested_max_throughput + - cmcd_tb, cmcd_top_bitrate + type: uint64 + - fields: + - cmcd_pr, cmcd_playback_rate + type: float64 + - fields: + - cmcd_bs, cmcd_buffer_starvation + - cmcd_su, cmcd_startup + type: boolean + + # the following is from breadcrumbs + - fields: + - breadcrumbs_parent_ip + - breadcrumbs_parent_request_id + - breadcrumbs_parent_geo + - breadcrumbs_edge_ip + - breadcrumbs_edge_request_id + - breadcrumbs_edge_geo + - breadcrumbs_origin_ip + - breadcrumbs_origin_request_id + - breadcrumbs_origin_geo + - breadcrumbs_peer_ip + - breadcrumbs_peer_request_id + - breadcrumbs_peer_geo + - breadcrumbs_cloud_wrapper_ip + - breadcrumbs_cloud_wrapper_request_id + - breadcrumbs_cloud_wrapper_geo + type: string + - fields: + - breadcrumbs_parent_request_end_time + - breadcrumbs_parent_turn_around_time + - breadcrumbs_parent_dns_lookup_time + - breadcrumbs_parent_asn + - breadcrumbs_edge_request_end_time + - breadcrumbs_edge_turn_around_time + - breadcrumbs_edge_dns_lookup_time + - breadcrumbs_edge_asn + - breadcrumbs_origin_request_end_time + - breadcrumbs_origin_turn_around_time + - breadcrumbs_origin_dns_lookup_time + - breadcrumbs_origin_asn + - breadcrumbs_peer_request_end_time + - breadcrumbs_peer_turn_around_time + - breadcrumbs_peer_dns_lookup_time + - breadcrumbs_peer_asn + - breadcrumbs_cloud_wrapper_request_end_time + - breadcrumbs_cloud_wrapper_turn_around_time + - breadcrumbs_cloud_wrapper_dns_lookup_time + - breadcrumbs_cloud_wrapper_asn + type: uint32 +"#; + + let expected_values = vec![ + ( + "breadcrumbs", + Some(StringValue("//BC/[a=23.33.41.20,c=g,k=0,l=1]".into())), + ), + ( + "referer", + Some(StringValue( + 
"https://test.referrer.net/en-US/docs/Web/test".into(), + )), + ), + ( + "query_str", + Some(StringValue("cmcd=//1.0@V/bl=21600,br=1426,cid=\"akam-email\",d=6006,mtp=11100,ot=m,sf=h,sid=\"9f36f5c9-d6a2-497b-8c73-4b8f694eab749f36f5c9-d6a2-497b-8c73\",tb=1426,dl=18500,nor=\"../300kbps/track.m4v\",nrr=\"12323-48763\",su,bs,rtp=12000,pr=1.08,sf=d,st=v\"".into())), + ), + ("custom_field", Some(StringValue("any-custom-value".into()))), + ("req_id", Some(StringValue("1239f220".into()))), + ("city", Some(StringValue("HERNDON".into()))), + ("state", Some(StringValue("Virginia".into()))), + ("country", Some(StringValue("IN".into()))), + ( + "security_rules", + Some(StringValue( + "ULnR_28976|3900000:3900001:3900005:3900006:BOT-ANOMALY-HEADER|".into(), + )), + ), + ( + "ew_usage_info", + Some(StringValue( + "//4380/4.0/1/-/0/4/#1,2\\//4380/4.0/4/-/0/4/#0,0\\//4380/4.0/5/-/1/1/#0,0".into(), + )), + ), + ( + "ew_execution_info", + Some(StringValue("c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200".into()))), + ( + "error_code", + Some(StringValue("ERR_ACCESS_DENIED|fwd_acl".into())), + ), + ("x_forwarded_for", Some(StringValue("8.47.28.38".into()))), + ("range", Some(StringValue("37334-42356".into()))), + ("acc_lang", Some(StringValue("en-US".into()))), + ("req_method", Some(StringValue("GET".into()))), + ("req_host", Some(StringValue("test.hostname.net".into()))), + ("proto", Some(StringValue("HTTPS".into()))), + ("cli_ip", Some(StringValue("128.147.28.68".into()))), + ("rsp_content_type", Some(StringValue("text/html".into()))), + ("tls_version", Some(StringValue("TLSv1".into()))), + ("version", Some(U8Value(1))), + ("cache_status", Some(U8Value(1))), + ("last_byte", Some(U8Value(1))), + ("stream_id", Some(U32Value(12345))), + ("billing_region", Some(U32Value(8))), + ("dns_lookup_time_msec", Some(U32Value(50))), + ("transfer_time_msec", Some(U32Value(125))), + ("turn_around_time_msec", Some(U32Value(11))), + ("req_end_time_msec", Some(U32Value(3))), + ("max_age_sec", Some(U32Value(3600))), + ("req_port", Some(U32Value(443))), + ("status_code", Some(U32Value(206))), + ("cp", Some(U32Value(123456))), + ("tls_overhead_time_msec", Some(U32Value(0))), + ("bytes", Some(U64Value(4995))), + ("rsp_content_len", Some(U64Value(5000))), + ("obj_size", Some(U64Value(484))), + ("uncompressed_size", Some(U64Value(484))), + ("overhead_bytes", Some(U64Value(232))), + ("total_bytes", Some(U64Value(0))), + ( + "user_agent", + Some(StringValue( + "Mozilla/5.0+(Macintosh;+Intel+Mac+OS+X+10_14_3)".into(), + )), + ), + ("cookie", Some(StringValue("cookie-content".into()))), + ( + "req_path", + Some(StringValue("/path1/path2/file.ext".into())), + ), + ("req_time_sec", Some(TimestampSecondValue(1573840000))), + ("cmcd_version", Some(StringValue("1.0".into()))), + ( + "cmcd_content_id", + Some(StringValue("\"akam-email\"".into())), + ), + ( + "cmcd_next_object_requests", + Some(StringValue("\"../300kbps/track.m4v\"".into())), + ), + ( + "cmcd_next_range_request", + Some(StringValue("\"12323-48763\"".into())), + ), + ("cmcd_object_type", Some(StringValue("m".into()))), + ("cmcd_streaming_format", Some(StringValue("d".into()))), + ( + "cmcd_session_id", + Some(StringValue( + "\"9f36f5c9-d6a2-497b-8c73-4b8f694eab749f36f5c9-d6a2-497b-8c73\"".into(), + )), + ), + ("cmcd_stream_type", Some(StringValue("v\"".into()))), + ("cmcd_v", None), + ("cmcd_encoded_bitrate", Some(U64Value(1426))), + ("cmcd_buffer_length", Some(U64Value(21600))), + ("cmcd_object_duration", Some(U64Value(6006))), + 
("cmcd_deadline", Some(U64Value(18500))), + ("cmcd_measured_throughput", Some(U64Value(11100))), + ("cmcd_requested_max_throughput", Some(U64Value(12000))), + ("cmcd_top_bitrate", Some(U64Value(1426))), + ("cmcd_playback_rate", Some(F64Value(1.08))), + ("cmcd_buffer_starvation", Some(BoolValue(true))), + ("cmcd_startup", Some(BoolValue(true))), + ("breadcrumbs_parent_ip", None), + ("breadcrumbs_parent_request_id", None), + ("breadcrumbs_parent_geo", None), + ( + "breadcrumbs_edge_ip", + Some(StringValue("23.33.41.20".into())), + ), + ("breadcrumbs_edge_request_id", None), + ("breadcrumbs_edge_geo", None), + ("breadcrumbs_origin_ip", None), + ("breadcrumbs_origin_request_id", None), + ("breadcrumbs_origin_geo", None), + ("breadcrumbs_peer_ip", None), + ("breadcrumbs_peer_request_id", None), + ("breadcrumbs_peer_geo", None), + ("breadcrumbs_cloud_wrapper_ip", None), + ("breadcrumbs_cloud_wrapper_request_id", None), + ("breadcrumbs_cloud_wrapper_geo", None), + ("breadcrumbs_parent_request_end_time", None), + ("breadcrumbs_parent_turn_around_time", None), + ("breadcrumbs_parent_dns_lookup_time", None), + ("breadcrumbs_parent_asn", None), + ("breadcrumbs_edge_request_end_time", Some(U32Value(0))), + ("breadcrumbs_edge_turn_around_time", Some(U32Value(1))), + ("breadcrumbs_edge_dns_lookup_time", None), + ("breadcrumbs_edge_asn", None), + ("breadcrumbs_origin_request_end_time", None), + ("breadcrumbs_origin_turn_around_time", None), + ("breadcrumbs_origin_dns_lookup_time", None), + ("breadcrumbs_origin_asn", None), + ("breadcrumbs_peer_request_end_time", None), + ("breadcrumbs_peer_turn_around_time", None), + ("breadcrumbs_peer_dns_lookup_time", None), + ("breadcrumbs_peer_asn", None), + ("breadcrumbs_cloud_wrapper_request_end_time", None), + ("breadcrumbs_cloud_wrapper_turn_around_time", None), + ("breadcrumbs_cloud_wrapper_dns_lookup_time", None), + ("breadcrumbs_cloud_wrapper_asn", None), + ] + .into_iter() + .map(|(_, d)| GreptimeValue { value_data: d }) + .collect::>(); + + let yaml_content = Content::Yaml(pipeline_yaml.into()); + let pipeline: Pipeline = + parse(&yaml_content).expect("failed to parse pipeline"); + let output = pipeline.exec(input_value).expect("failed to exec pipeline"); + + assert_eq!(output.rows.len(), 1); + let values = output.rows.first().unwrap().values.clone(); + assert_eq!(expected_values, values); + + for s in output.schema.iter() { + info!( + "{}({}): {}", + s.column_name, + s.datatype().as_str_name(), + s.semantic_type().as_str_name() + ); + } + info!("\n"); + + let get_schema_name = |ss: &Vec, i: usize| { + let s = ss.get(i).unwrap(); + s.column_name.clone() + }; + + for row in output.rows.iter() { + let values = &row.values; + for i in 0..values.len() { + let val = values.get(i).unwrap(); + info!( + "{}: {:?}, ", + get_schema_name(&output.schema, i), + val.value_data + ); + } + info!("\n"); + } +} diff --git a/src/script/src/table.rs b/src/script/src/table.rs index 1ba160028d1d..5eacf1ff7ce9 100644 --- a/src/script/src/table.rs +++ b/src/script/src/table.rs @@ -292,7 +292,7 @@ impl ScriptsTable { } /// Build the inserted column schemas -fn build_insert_column_schemas() -> Vec { +fn build_insert_column_schemas() -> Vec { vec![ // The schema that script belongs to. 
PbColumnSchema { diff --git a/src/servers/src/http/handler.rs b/src/servers/src/http/handler.rs index 302c2186eb89..44c83462c845 100644 --- a/src/servers/src/http/handler.rs +++ b/src/servers/src/http/handler.rs @@ -37,9 +37,7 @@ use session::context::QueryContextRef; use snafu::ResultExt; use super::header::collect_plan_metrics; -use crate::error::{ - Error, InsertLogSnafu, ParseJsonSnafu, UnsupportedContentTypeSnafu, -}; +use crate::error::{Error, InsertLogSnafu, ParseJsonSnafu, UnsupportedContentTypeSnafu}; use crate::http::arrow_result::ArrowResponse; use crate::http::csv_result::CsvResponse; use crate::http::error_result::ErrorResponse; From eb9cd22ba04f51e3b7958bb5f6d014e40481e2f3 Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Tue, 4 Jun 2024 15:36:46 +0800 Subject: [PATCH 06/16] chore: fix typo --- typos.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/typos.toml b/typos.toml index 02f2ed6e695a..e03dc5685a3d 100644 --- a/typos.toml +++ b/typos.toml @@ -2,6 +2,7 @@ Pn = "Pn" ue = "ue" worl = "worl" +ot = "ot" [files] extend-exclude = [ From 8d0595c12c75100b827809803eca7eb51b3660c7 Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Tue, 4 Jun 2024 15:43:14 +0800 Subject: [PATCH 07/16] refactor: bring in pipeline a95c9767d7056ab01dd8ca5fa1214456c6ffc72c --- src/pipeline/src/etl/processor/dissect.rs | 417 ++++++++++++++++++++++ src/pipeline/src/etl/processor/mod.rs | 42 ++- src/pipeline/src/etl/value/map.rs | 6 + 3 files changed, 457 insertions(+), 8 deletions(-) create mode 100644 src/pipeline/src/etl/processor/dissect.rs diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs new file mode 100644 index 000000000000..06d49bbfeceb --- /dev/null +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -0,0 +1,417 @@ +// Copyright 2024 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_telemetry::warn; + +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::{ + yaml_bool, yaml_field, yaml_fields, yaml_parse_strings, yaml_string, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, PATTERNS_NAME, +}; +use crate::etl::value::{Map, Value}; + +pub(crate) const PROCESSOR_DISSECT: &str = "dissect"; + +const APPEND_SEPARATOR_NAME: &str = "append_separator"; + +#[derive(Debug, PartialEq)] +enum Part { + Split(String), + Key(String), +} + +impl Part { + fn is_empty(&self) -> bool { + match self { + Part::Split(v) => v.is_empty(), + Part::Key(v) => v.is_empty(), + } + } + + fn empty_split() -> Self { + Part::Split(String::new()) + } + + fn empty_key() -> Self { + Part::Key(String::new()) + } +} + +impl std::ops::Deref for Part { + type Target = String; + + fn deref(&self) -> &Self::Target { + match self { + Part::Split(v) => v, + Part::Key(v) => v, + } + } +} + +impl std::ops::DerefMut for Part { + fn deref_mut(&mut self) -> &mut Self::Target { + match self { + Part::Split(v) => v, + Part::Key(v) => v, + } + } +} + +#[derive(Debug, Default)] +struct Pattern { + origin: String, + parts: Vec, +} + +impl std::ops::Deref for Pattern { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.parts + } +} + +impl std::ops::DerefMut for Pattern { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.parts + } +} + +impl std::str::FromStr for Pattern { + type Err = String; + + fn from_str(s: &str) -> Result { + let mut parts = vec![]; + let mut cursor = Part::empty_split(); + + let origin = s.to_string(); + let mut last_ch = None; + let chars: Vec = origin.chars().collect(); + + for i in 0..chars.len() { + let ch = chars[i]; + match (ch, &mut cursor) { + // if cursor is Split part, and found %{, then ready to start a Key part + ('%', Part::Split(_)) if i + 1 < chars.len() && chars[i + 1] == '{' => {} + // if cursor is Split part, and found %{, then end the Split part, start the Key part + ('{', Part::Split(_)) if last_ch == Some('%') => { + if !cursor.is_empty() { + parts.push(cursor); + } + + cursor = Part::empty_key(); + } + // if cursor is Split part, and not found % or {, then continue the Split part + (_, Part::Split(_)) => { + cursor.push(ch); + } + // if cursor is Key part, and found }, then end the Key part, start the next Split part + ('}', Part::Key(_)) => { + parts.push(cursor); + cursor = Part::empty_split(); + } + (_, Part::Key(_)) if !is_valid_char(ch) => { + return Err(format!("Invalid character in key: '{ch}'")); + } + (_, Part::Key(_)) => { + cursor.push(ch); + } + } + + last_ch = Some(ch); + } + + let pattern = Self { parts, origin }; + pattern.check()?; + Ok(pattern) + } +} + +impl Pattern { + fn check(&self) -> Result<(), String> { + if self.len() == 0 { + return Err("Empty pattern is not allowed".to_string()); + } + + for i in 0..self.len() { + let this_part = &self[i]; + let next_part = self.get(i + 1); + match (this_part, next_part) { + (Part::Split(split), _) if split.is_empty() => { + return Err("Empty split is not allowed".to_string()); + } + (Part::Key(key1), Some(Part::Key(key2))) => { + return Err(format!( + "consecutive keys are not allowed: '{key1}' '{key2}'" + )); + } + _ => {} + } + } + Ok(()) + } +} + +impl std::fmt::Display for Pattern { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.origin) + } +} + +#[derive(Debug, Default)] +pub struct DissectProcessor { + fields: Fields, + patterns: Vec, + ignore_missing: bool, + + // The character(s) that separate the 
appended fields. Default is an empty string. + append_separator: Option, +} + +impl DissectProcessor { + fn with_fields(&mut self, fields: Fields) { + self.fields = fields; + } + + fn with_ignore_missing(&mut self, ignore_missing: bool) { + self.ignore_missing = ignore_missing; + } + + fn with_patterns(&mut self, patterns: Vec) { + self.patterns = patterns; + } + + fn with_append_separator(&mut self, append_separator: String) { + self.append_separator = Some(append_separator); + } + + fn process_pattern(chs: &[char], pattern: &Pattern) -> Result { + let mut map = Map::default(); + let mut pos = 0; + + for i in 0..pattern.len() { + let this_part = &pattern[i]; + let next_part = pattern.get(i + 1); + match (this_part, next_part) { + (Part::Split(split), _) => { + let split_chs = split.chars().collect::>(); + let split_len = split_chs.len(); + if pos + split_len > chs.len() { + return Err(format!("'{split}' exceeds the input",)); + } + + if &chs[pos..pos + split_len] != split_chs.as_slice() { + return Err(format!( + "'{split}' does not match the input '{}'", + chs[pos..pos + split_len].iter().collect::() + )); + } + + pos += split_len; + } + (Part::Key(key), None) => { + let value = chs[pos..].iter().collect::(); + map.insert(key.clone(), Value::String(value)); + } + + (Part::Key(key), Some(Part::Split(split))) => match split.chars().next() { + None => return Err("Empty split is not allowed".to_string()), + Some(stop) => { + let mut end = pos; + while end < chs.len() && chs[end] != stop { + end += 1; + } + + if end == chs.len() { + return Err("No matching split found".to_string()); + } + + let value = chs[pos..end].iter().collect::(); + map.insert(key.clone(), Value::String(value)); + pos = end; + } + }, + (Part::Key(key1), Some(Part::Key(key2))) => { + return Err(format!( + "consecutive keys are not allowed: '{key1}' '{key2}'" + )); + } + } + } + + Ok(map) + } + + fn process(&self, val: &str) -> Result { + let chs = val.chars().collect::>(); + + for pattern in &self.patterns { + if let Ok(map) = DissectProcessor::process_pattern(&chs, pattern) { + return Ok(map); + } + } + + Err("No matching pattern found".to_string()) + } +} + +impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessor { + type Error = String; + + fn try_from(value: &yaml_rust::yaml::Hash) -> Result { + let mut processor = Self::default(); + + for (k, v) in value.iter() { + let key = k + .as_str() + .ok_or(format!("key must be a string, but got '{k:?}'"))?; + + match key { + FIELD_NAME => processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)), + FIELDS_NAME => processor.with_fields(yaml_fields(v, FIELDS_NAME)?), + PATTERNS_NAME => { + let patterns = yaml_parse_strings(v, PATTERNS_NAME)?; + processor.with_patterns(patterns); + } + IGNORE_MISSING_NAME => { + processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?) + } + APPEND_SEPARATOR_NAME => { + processor.with_append_separator(yaml_string(v, APPEND_SEPARATOR_NAME)?) 
+ } + _ => {} + } + } + + Ok(processor) + } +} + +impl Processor for DissectProcessor { + fn kind(&self) -> &str { + PROCESSOR_DISSECT + } + + fn ignore_missing(&self) -> bool { + self.ignore_missing + } + + fn fields(&self) -> &Fields { + &self.fields + } + + fn exec_field(&self, val: &Value, _field: &Field) -> Result { + match val { + Value::String(val) => match self.process(val) { + Ok(map) => Ok(map), + Err(e) => { + warn!("dissect processor: {}", e); + Ok(Map::default()) + } + }, + _ => Err(format!( + "{} processor: expect string value, but got {val:?}", + self.kind() + )), + } + } +} + +fn is_valid_char(ch: char) -> bool { + ch.is_alphanumeric() || ch == '_' +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::{DissectProcessor, Part, Pattern}; + use crate::etl::value::{Map, Value}; + + #[test] + fn test_pattern() { + let cases = [( + "%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}", + vec![ + Part::Key("clientip".chars().collect()), + Part::Split(" ".chars().collect()), + Part::Key("ident".chars().collect()), + Part::Split(" ".chars().collect()), + Part::Key("auth".chars().collect()), + Part::Split(" [".chars().collect()), + Part::Key("timestamp".chars().collect()), + Part::Split("] \"".chars().collect()), + Part::Key("verb".chars().collect()), + Part::Split(" ".chars().collect()), + Part::Key("request".chars().collect()), + Part::Split(" HTTP/".chars().collect()), + Part::Key("httpversion".chars().collect()), + Part::Split("\" ".chars().collect()), + Part::Key("status".chars().collect()), + Part::Split(" ".chars().collect()), + Part::Key("size".chars().collect()), + ], + )]; + + for (pattern, expected) in cases.into_iter() { + let p: Pattern = pattern.parse().unwrap(); + assert_eq!(p.parts, expected); + } + } + + #[test] + fn test_process() { + let assert = |pattern_str: &str, input: &str, expected: HashMap| { + let chs = input.chars().collect::>(); + let pattern = pattern_str.parse().unwrap(); + let map = DissectProcessor::process_pattern(&chs, &pattern).unwrap(); + + assert_eq!(map, Map::from(expected)); + }; + + let expected = [ + ("timestamp", "30/Apr/1998:22:00:52 +0000"), + ("status", "200"), + ("clientip", "1.2.3.4"), + ("ident", "-"), + ("size", "3171"), + ( + "request", + "/english/venues/cities/images/montpellier/18.gif", + ), + ("auth", "-"), + ("verb", "GET"), + ("httpversion", "1.0"), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))) + .collect::>(); + + { + // pattern start with Key + let pattern_str = "%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}"; + let input = "1.2.3.4 - - [30/Apr/1998:22:00:52 +0000] \"GET /english/venues/cities/images/montpellier/18.gif HTTP/1.0\" 200 3171"; + + assert(pattern_str, input, expected.clone()); + } + + { + // pattern start with Split + let pattern_str = " %{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}"; + let input = " 1.2.3.4 - - [30/Apr/1998:22:00:52 +0000] \"GET /english/venues/cities/images/montpellier/18.gif HTTP/1.0\" 200 3171"; + + assert(pattern_str, input, expected); + } + } +} diff --git a/src/pipeline/src/etl/processor/mod.rs b/src/pipeline/src/etl/processor/mod.rs index c04414b87cbd..81f324a43285 100644 --- a/src/pipeline/src/etl/processor/mod.rs +++ b/src/pipeline/src/etl/processor/mod.rs @@ -15,6 +15,7 @@ pub mod cmcd; pub mod csv; pub mod date; +pub mod dissect; pub mod epoch; pub mod 
letter; pub mod regex; @@ -26,6 +27,7 @@ use cmcd::CMCDProcessor; use common_telemetry::warn; use csv::CsvProcessor; use date::DateProcessor; +use dissect::DissectProcessor; use epoch::EpochProcessor; use letter::LetterProcessor; use regex::RegexProcessor; @@ -38,6 +40,7 @@ const FIELD_NAME: &str = "field"; const FIELDS_NAME: &str = "fields"; const IGNORE_MISSING_NAME: &str = "ignore_missing"; const METHOD_NAME: &str = "method"; +const PATTERNS_NAME: &str = "patterns"; // const IF_NAME: &str = "if"; // const IGNORE_FAILURE_NAME: &str = "ignore_failure"; @@ -53,14 +56,14 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { true } - fn exec_field(&self, _val: &Value, _field: &Field) -> Result { - Ok(Map::default()) + /// default behavior does nothing and returns the input value + fn exec_field(&self, val: &Value, field: &Field) -> Result { + Ok(Map::one(field.get_field(), val.clone())) } fn exec_map(&self, mut map: Map) -> Result { for ff @ Field { field, .. } in self.fields().iter() { - let val = map.get(field); - match val { + match map.get(field) { Some(v) => { map.extend(self.exec_field(v, ff)?); } @@ -158,6 +161,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result, String> cmcd::PROCESSOR_CMCD => Arc::new(CMCDProcessor::try_from(value)?), csv::PROCESSOR_CSV => Arc::new(CsvProcessor::try_from(value)?), date::PROCESSOR_DATE => Arc::new(DateProcessor::try_from(value)?), + dissect::PROCESSOR_DISSECT => Arc::new(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => Arc::new(EpochProcessor::try_from(value)?), letter::PROCESSOR_LETTER => Arc::new(LetterProcessor::try_from(value)?), regex::PROCESSOR_REGEX => Arc::new(RegexProcessor::try_from(value)?), @@ -188,11 +192,33 @@ pub(crate) fn yaml_bool(v: &yaml_rust::Yaml, field: &str) -> Result Result { - yaml_string(v, field)?.parse() +pub(crate) fn yaml_parse_string(v: &yaml_rust::Yaml, field: &str) -> Result +where + T: std::str::FromStr, + T::Err: ToString, +{ + yaml_string(v, field)? 
+ .parse::() + .map_err(|e| e.to_string()) +} + +pub(crate) fn yaml_parse_strings(v: &yaml_rust::Yaml, field: &str) -> Result, String> +where + T: std::str::FromStr, + T::Err: ToString, +{ + yaml_strings(v, field).and_then(|v| { + v.into_iter() + .map(|s| s.parse::().map_err(|e| e.to_string())) + .collect() + }) } pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result { - let ff = yaml_strings(v, field).and_then(|v| v.into_iter().map(|s| s.parse()).collect())?; - Fields::new(ff) + let v = yaml_parse_strings(v, field)?; + Fields::new(v) +} + +pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result { + yaml_parse_string(v, field) } diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs index 3b03ab311fb0..8e41c92fc0dd 100644 --- a/src/pipeline/src/etl/value/map.rs +++ b/src/pipeline/src/etl/value/map.rs @@ -37,6 +37,12 @@ impl Map { } } +impl From> for Map { + fn from(values: HashMap) -> Self { + Map { values } + } +} + impl std::ops::Deref for Map { type Target = HashMap; From 061b14e4c787261e0995ad7510efda234e2e9a8a Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Tue, 4 Jun 2024 15:48:19 +0800 Subject: [PATCH 08/16] chore: fix typo and license header --- src/pipeline/src/etl/field.rs | 2 +- src/pipeline/src/etl/mod.rs | 2 +- src/pipeline/src/etl/processor/cmcd.rs | 2 +- src/pipeline/src/etl/processor/csv.rs | 2 +- src/pipeline/src/etl/processor/date.rs | 2 +- src/pipeline/src/etl/processor/dissect.rs | 2 +- src/pipeline/src/etl/processor/epoch.rs | 2 +- src/pipeline/src/etl/processor/letter.rs | 2 +- src/pipeline/src/etl/processor/mod.rs | 2 +- src/pipeline/src/etl/processor/regex.rs | 2 +- src/pipeline/src/etl/processor/urlencoding.rs | 2 +- src/pipeline/src/etl/transform/index.rs | 2 +- src/pipeline/src/etl/transform/mod.rs | 2 +- .../etl/transform/transformer/greptime/coerce.rs | 2 +- .../src/etl/transform/transformer/greptime/mod.rs | 2 +- src/pipeline/src/etl/transform/transformer/mod.rs | 2 +- src/pipeline/src/etl/transform/transformer/noop.rs | 2 +- src/pipeline/src/etl/value/array.rs | 2 +- src/pipeline/src/etl/value/map.rs | 2 +- src/pipeline/src/etl/value/mod.rs | 2 +- src/pipeline/src/etl/value/time.rs | 2 +- src/pipeline/src/lib.rs | 14 ++++++++++++++ src/pipeline/src/mng/mod.rs | 14 ++++++++++++++ src/pipeline/src/mng/table.rs | 14 ++++++++++++++ src/pipeline/tests/pipeline.rs | 2 +- src/servers/src/error.rs | 8 ++++---- src/servers/src/http/handler.rs | 2 +- 27 files changed, 69 insertions(+), 27 deletions(-) diff --git a/src/pipeline/src/etl/field.rs b/src/pipeline/src/etl/field.rs index 9d76b540953f..34181be5f4a3 100644 --- a/src/pipeline/src/etl/field.rs +++ b/src/pipeline/src/etl/field.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/mod.rs b/src/pipeline/src/etl/mod.rs index 74c6cd96c547..4e4595479482 100644 --- a/src/pipeline/src/etl/mod.rs +++ b/src/pipeline/src/etl/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 7001ddb5b493..256d6f05da11 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index 1cd110922892..ae578d79e1f6 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs index 6715522793a7..9c4037900a7c 100644 --- a/src/pipeline/src/etl/processor/date.rs +++ b/src/pipeline/src/etl/processor/date.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 06d49bbfeceb..2af008ceac86 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs index feee2fa8d717..96a8695c9f76 100644 --- a/src/pipeline/src/etl/processor/epoch.rs +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs index 1c2fcf9eacfc..e533536769d2 100644 --- a/src/pipeline/src/etl/processor/letter.rs +++ b/src/pipeline/src/etl/processor/letter.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/mod.rs b/src/pipeline/src/etl/processor/mod.rs index 81f324a43285..96e8a629f252 100644 --- a/src/pipeline/src/etl/processor/mod.rs +++ b/src/pipeline/src/etl/processor/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index 078deef603b5..8aba43436155 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/processor/urlencoding.rs b/src/pipeline/src/etl/processor/urlencoding.rs index f9019fd19126..c0d1669f85de 100644 --- a/src/pipeline/src/etl/processor/urlencoding.rs +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/transform/index.rs b/src/pipeline/src/etl/transform/index.rs index b554824f52a3..674df720f8c3 100644 --- a/src/pipeline/src/etl/transform/index.rs +++ b/src/pipeline/src/etl/transform/index.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/transform/mod.rs b/src/pipeline/src/etl/transform/mod.rs index 1c1a0f4141de..991aa05df644 100644 --- a/src/pipeline/src/etl/transform/mod.rs +++ b/src/pipeline/src/etl/transform/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs index 48b612e3d53a..6b077a22dca5 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/transform/transformer/greptime/mod.rs b/src/pipeline/src/etl/transform/transformer/greptime/mod.rs index 933f621d6a14..bbbfa0e9104b 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime/mod.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/transform/transformer/mod.rs b/src/pipeline/src/etl/transform/transformer/mod.rs index 173aac61bfe5..87bd16b4feca 100644 --- a/src/pipeline/src/etl/transform/transformer/mod.rs +++ b/src/pipeline/src/etl/transform/transformer/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/src/pipeline/src/etl/transform/transformer/noop.rs b/src/pipeline/src/etl/transform/transformer/noop.rs index 6bd7a208c981..40b82d5ee760 100644 --- a/src/pipeline/src/etl/transform/transformer/noop.rs +++ b/src/pipeline/src/etl/transform/transformer/noop.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/value/array.rs b/src/pipeline/src/etl/value/array.rs index 617d9beed348..a401cf00ab67 100644 --- a/src/pipeline/src/etl/value/array.rs +++ b/src/pipeline/src/etl/value/array.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs index 8e41c92fc0dd..47041f15350d 100644 --- a/src/pipeline/src/etl/value/map.rs +++ b/src/pipeline/src/etl/value/map.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/value/mod.rs b/src/pipeline/src/etl/value/mod.rs index a9d7c34feba1..a8daa5fa6149 100644 --- a/src/pipeline/src/etl/value/mod.rs +++ b/src/pipeline/src/etl/value/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/etl/value/time.rs b/src/pipeline/src/etl/value/time.rs index 5dbc4d4e092e..cca883f33231 100644 --- a/src/pipeline/src/etl/value/time.rs +++ b/src/pipeline/src/etl/value/time.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 57f7bf903a4a..ce2f4f6d78a2 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -1,3 +1,17 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + mod etl; mod mng; diff --git a/src/pipeline/src/mng/mod.rs b/src/pipeline/src/mng/mod.rs index 91ca63d5bac0..cb4854dc181e 100644 --- a/src/pipeline/src/mng/mod.rs +++ b/src/pipeline/src/mng/mod.rs @@ -1,2 +1,16 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + pub mod error; pub mod table; diff --git a/src/pipeline/src/mng/table.rs b/src/pipeline/src/mng/table.rs index 34dbc81bb4f5..7863982a66b0 100644 --- a/src/pipeline/src/mng/table.rs +++ b/src/pipeline/src/mng/table.rs @@ -1,3 +1,17 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + use std::collections::HashMap; use std::sync::{Arc, RwLock}; diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index 0a597f188fd9..869bd13c78f3 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -1,4 +1,4 @@ -// Copyright 2024 Greptime Team +// Copyright 2023 Greptime Team // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index f4f4e277764f..d4a849496817 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -558,13 +558,13 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to conver to structed log"))] - ToStructedLog { + #[snafu(display("Failed to convert to structured log"))] + ToStructuredLog { #[snafu(implicit)] location: Location, }, - #[snafu(display("Unsupport content type: {:?}", content_type))] + #[snafu(display("Unsupported content type: {:?}", content_type))] UnsupportedContentType { content_type: ContentType, #[snafu(implicit)] @@ -685,7 +685,7 @@ impl ErrorExt for Error { | MysqlValueConversion { .. } | UnexpectedPhysicalTable { .. } | ParseJson { .. } - | ToStructedLog { .. } + | ToStructuredLog { .. } | UnsupportedContentType { .. } | InsertLog { .. } | TimestampOverflow { .. 
} => StatusCode::InvalidArguments, diff --git a/src/servers/src/http/handler.rs b/src/servers/src/http/handler.rs index 44c83462c845..fd9623864e3f 100644 --- a/src/servers/src/http/handler.rs +++ b/src/servers/src/http/handler.rs @@ -79,7 +79,7 @@ pub struct LogIngesterQueryParams { } fn parse_space_separated_log(payload: String) -> Result { - // ToStructedLogSnafu + // ToStructuredLogSnafu let _log = payload.split_whitespace().collect::>(); // TODO (qtang): implement this todo!() From 51e2b6e728de398714f64ca2c6d90aee78c3b45c Mon Sep 17 00:00:00 2001 From: Kelvin Wu Date: Tue, 4 Jun 2024 16:10:44 +0800 Subject: [PATCH 09/16] fix: display the PartitionBound and PartitionDef correctly (#4101) * fix: display the PartitionBound and PartitionDef correctly * Update src/partition/src/partition.rs Co-authored-by: dennis zhuang * fix: fix unit test of partition definition --------- Co-authored-by: dennis zhuang --- src/partition/src/partition.rs | 7 +++--- .../cases/standalone/common/partition.result | 24 +++++++++---------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/partition/src/partition.rs b/src/partition/src/partition.rs index 6735ccf6d9f1..28cda6a817b4 100644 --- a/src/partition/src/partition.rs +++ b/src/partition/src/partition.rs @@ -63,7 +63,7 @@ impl Display for PartitionBound { match self { Self::Value(v) => write!(f, "{}", v), Self::MaxValue => write!(f, "MAXVALUE"), - Self::Expr(e) => write!(f, "{:?}", e), + Self::Expr(e) => write!(f, "{}", e), } } } @@ -72,8 +72,7 @@ impl Display for PartitionDef { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, - "({}) VALUES LESS THAN ({})", - self.partition_columns.iter().join(", "), + "{}", self.partition_bounds .iter() .map(|b| format!("{b}")) @@ -188,7 +187,7 @@ mod tests { PartitionBound::Value(1_i32.into()), ], }; - assert_eq!("(a, b) VALUES LESS THAN (MAXVALUE, 1)", def.to_string()); + assert_eq!("MAXVALUE, 1", def.to_string()); let partition: MetaPartition = def.try_into().unwrap(); assert_eq!( diff --git a/tests/cases/standalone/common/partition.result b/tests/cases/standalone/common/partition.result index 9c76a87df100..53b30056b879 100644 --- a/tests/cases/standalone/common/partition.result +++ b/tests/cases/standalone/common/partition.result @@ -14,13 +14,13 @@ Affected Rows: 0 -- SQLNESS REPLACE (\d{13}) ID SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name; -+---------------+--------------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+ -| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id | -+---------------+--------------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+ -| greptime | public | my_table | p0 | (a) VALUES LESS THAN (PartitionExpr { lhs: Column("a"), op: Lt, rhs: Value(Int32(1000)) }) | ID | -| greptime | public | my_table | p1 | (a) VALUES LESS THAN (PartitionExpr { lhs: Column("a"), op: GtEq, rhs: 
Value(Int32(2000)) }) | ID | -| greptime | public | my_table | p2 | (a) VALUES LESS THAN (PartitionExpr { lhs: Expr(PartitionExpr { lhs: Column("a"), op: GtEq, rhs: Value(Int32(1000)) }), op: And, rhs: Expr(PartitionExpr { lhs: Column("a"), op: Lt, rhs: Value(Int32(2000)) }) }) | ID | -+---------------+--------------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+ ++---------------+--------------+------------+----------------+------------------------+-----------------------+ +| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id | ++---------------+--------------+------------+----------------+------------------------+-----------------------+ +| greptime | public | my_table | p0 | a < 1000 | ID | +| greptime | public | my_table | p1 | a >= 2000 | ID | +| greptime | public | my_table | p2 | a >= 1000 AND a < 2000 | ID | ++---------------+--------------+------------+----------------+------------------------+-----------------------+ -- SQLNESS REPLACE (\d{13}) REGION_ID -- SQLNESS REPLACE (\d{1}) PEER_ID @@ -120,11 +120,11 @@ Affected Rows: 0 -- SQLNESS REPLACE (\d{13}) ID SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name; -+---------------+--------------+------------+----------------+---------------------------------+-----------------------+ -| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id | -+---------------+--------------+------------+----------------+---------------------------------+-----------------------+ -| greptime | public | my_table | p0 | (a) VALUES LESS THAN (MAXVALUE) | ID | -+---------------+--------------+------------+----------------+---------------------------------+-----------------------+ ++---------------+--------------+------------+----------------+----------------------+-----------------------+ +| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id | ++---------------+--------------+------------+----------------+----------------------+-----------------------+ +| greptime | public | my_table | p0 | MAXVALUE | ID | ++---------------+--------------+------------+----------------+----------------------+-----------------------+ -- SQLNESS REPLACE (\d{13}) REGION_ID -- SQLNESS REPLACE (\d{1}) PEER_ID From b3a4362626cd2c213f5f996b4d95a7d7d0c39119 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Tue, 4 Jun 2024 16:11:15 +0800 Subject: [PATCH 10/16] test: run `test_flush_reopen_region` and `test_region_replay` with `KafkaLogStore` (#4083) * feat: add `LogStoreFactory` to `TestEnv` * feat: add `multiple_log_store_factories` template * test: run `test_flush_reopen_region` and `test_region_replay` with `KafkaLogStore` * chore: move deps to workspace * chore: apply suggestions from CR --- Cargo.lock | 31 +- Cargo.toml | 2 + src/log-store/src/test_util/log_store_util.rs | 14 + src/mito2/Cargo.toml | 9 +- src/mito2/src/engine/basic_test.rs | 38 +- src/mito2/src/engine/flush_test.rs | 41 +- src/mito2/src/test_util.rs | 394 ++++++++++++++---- tests-integration/Cargo.toml | 4 +- tests-integration/src/lib.rs | 5 - 9 files 
changed, 422 insertions(+), 116 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18fa00347ff4..d722b1e9f2ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5834,6 +5834,7 @@ dependencies = [ "datafusion-common 38.0.0", "datafusion-expr 38.0.0", "datatypes", + "dotenv", "futures", "humantime-serde", "index", @@ -5851,6 +5852,9 @@ dependencies = [ "puffin", "rand", "regex", + "rskafka", + "rstest", + "rstest_reuse", "serde", "serde_json", "serde_with", @@ -8256,6 +8260,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "relative-path" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" + [[package]] name = "rend" version = "0.4.2" @@ -8522,9 +8532,9 @@ dependencies = [ [[package]] name = "rstest" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de1bb486a691878cd320c2f0d319ba91eeaa2e894066d8b5f8f117c000e9d962" +checksum = "9afd55a67069d6e434a95161415f5beeada95a01c7b815508a82dcb0e1593682" dependencies = [ "futures", "futures-timer", @@ -8534,28 +8544,31 @@ dependencies = [ [[package]] name = "rstest_macros" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290ca1a1c8ca7edb7c3283bd44dc35dd54fdec6253a3912e201ba1072018fca8" +checksum = "4165dfae59a39dd41d8dec720d3cbfbc71f69744efb480a3920f5d4e0cc6798d" dependencies = [ "cfg-if", + "glob", + "proc-macro-crate 3.1.0", "proc-macro2", "quote", + "regex", + "relative-path", "rustc_version", - "syn 1.0.109", + "syn 2.0.66", "unicode-ident", ] [[package]] name = "rstest_reuse" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45f80dcc84beab3a327bbe161f77db25f336a1452428176787c8c79ac79d7073" +checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand", - "rustc_version", - "syn 1.0.109", + "syn 2.0.66", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 9623dfb90116..bdfa94e8a639 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -153,6 +153,8 @@ reqwest = { version = "0.12", default-features = false, features = [ "multipart", ] } rskafka = "0.5" +rstest = "0.21" +rstest_reuse = "0.7" rust_decimal = "1.33" schemars = "0.8" serde = { version = "1.0", features = ["derive"] } diff --git a/src/log-store/src/test_util/log_store_util.rs b/src/log-store/src/test_util/log_store_util.rs index c25b411665b3..e714088d89ec 100644 --- a/src/log-store/src/test_util/log_store_util.rs +++ b/src/log-store/src/test_util/log_store_util.rs @@ -13,10 +13,13 @@ // limitations under the License. use std::path::Path; +use std::time::Duration; use common_base::readable_size::ReadableSize; +use common_wal::config::kafka::DatanodeKafkaConfig; use common_wal::config::raft_engine::RaftEngineConfig; +use crate::kafka::log_store::KafkaLogStore; use crate::raft_engine::log_store::RaftEngineLogStore; /// Create a write log for the provided path, used for test. @@ -28,3 +31,14 @@ pub async fn create_tmp_local_file_log_store>(path: P) -> RaftEng }; RaftEngineLogStore::try_new(path, cfg).await.unwrap() } + +/// Create a [KafkaLogStore]. 
+pub async fn create_kafka_log_store(broker_endpoints: Vec) -> KafkaLogStore { + KafkaLogStore::try_new(&DatanodeKafkaConfig { + broker_endpoints, + linger: Duration::from_millis(1), + ..Default::default() + }) + .await + .unwrap() +} diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index f9fdb5b574a4..f3a747dcd02f 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -6,7 +6,7 @@ license.workspace = true [features] default = [] -test = ["common-test-util", "log-store"] +test = ["common-test-util", "log-store", "rstest", "rstest_reuse", "rskafka"] [lints] workspace = true @@ -37,6 +37,7 @@ datafusion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true datatypes.workspace = true +dotenv.workspace = true futures.workspace = true humantime-serde.workspace = true index.workspace = true @@ -54,6 +55,9 @@ prost.workspace = true puffin.workspace = true rand.workspace = true regex = "1.5" +rskafka = { workspace = true, optional = true } +rstest = { workspace = true, optional = true } +rstest_reuse = { workspace = true, optional = true } serde.workspace = true serde_json.workspace = true serde_with.workspace = true @@ -73,6 +77,9 @@ common-test-util.workspace = true criterion = "0.4" log-store.workspace = true object-store = { workspace = true, features = ["services-memory"] } +rskafka.workspace = true +rstest.workspace = true +rstest_reuse.workspace = true toml.workspace = true [[bench]] diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index 9a5cca209b7a..439b3a2fe0d3 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -22,8 +22,11 @@ use common_base::readable_size::ReadableSize; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_recordbatch::RecordBatches; +use common_wal::options::WAL_OPTIONS_KEY; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; +use rstest::rstest; +use rstest_reuse::{self, apply}; use store_api::metadata::ColumnMetadata; use store_api::region_request::{RegionCreateRequest, RegionOpenRequest, RegionPutRequest}; use store_api::storage::RegionId; @@ -32,7 +35,9 @@ use super::*; use crate::region::version::VersionControlData; use crate::test_util::{ build_delete_rows_for_key, build_rows, build_rows_for_key, delete_rows, delete_rows_schema, - flush_region, put_rows, reopen_region, rows_schema, CreateRequestBuilder, TestEnv, + flush_region, kafka_log_store_factory, multiple_log_store_factories, + prepare_test_for_kafka_log_store, put_rows, raft_engine_log_store_factory, reopen_region, + rows_schema, CreateRequestBuilder, LogStoreFactory, TestEnv, }; #[tokio::test] @@ -83,14 +88,24 @@ async fn test_write_to_region() { put_rows(&engine, region_id, rows).await; } -#[tokio::test] -async fn test_region_replay() { +#[apply(multiple_log_store_factories)] + +async fn test_region_replay(factory: Option) { + use common_wal::options::{KafkaWalOptions, WalOptions}; + common_telemetry::init_default_ut_logging(); - let mut env = TestEnv::with_prefix("region-replay"); + let Some(factory) = factory else { + return; + }; + let mut env = TestEnv::with_prefix("region-replay").with_log_store_factory(factory.clone()); let engine = env.create_engine(MitoConfig::default()).await; let region_id = RegionId::new(1, 1); - let request = CreateRequestBuilder::new().build(); + let topic = prepare_test_for_kafka_log_store(&factory).await; + let request = CreateRequestBuilder::new() + .kafka_topic(topic.clone()) + .build(); 
+ let region_dir = request.region_dir.clone(); let column_schemas = rows_schema(&request); @@ -113,13 +128,24 @@ async fn test_region_replay() { let engine = env.reopen_engine(engine, MitoConfig::default()).await; + let mut options = HashMap::new(); + if let Some(topic) = &topic { + options.insert( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&WalOptions::Kafka(KafkaWalOptions { + topic: topic.to_string(), + })) + .unwrap(), + ); + }; + let result = engine .handle_request( region_id, RegionRequest::Open(RegionOpenRequest { engine: String::new(), region_dir, - options: HashMap::default(), + options, skip_wal_replay: false, }), ) diff --git a/src/mito2/src/engine/flush_test.rs b/src/mito2/src/engine/flush_test.rs index 89d44dc76129..52fb46dfab6a 100644 --- a/src/mito2/src/engine/flush_test.rs +++ b/src/mito2/src/engine/flush_test.rs @@ -21,6 +21,9 @@ use std::time::Duration; use api::v1::Rows; use common_recordbatch::RecordBatches; use common_time::util::current_time_millis; +use common_wal::options::WAL_OPTIONS_KEY; +use rstest::rstest; +use rstest_reuse::{self, apply}; use store_api::region_engine::RegionEngine; use store_api::region_request::RegionRequest; use store_api::storage::{RegionId, ScanRequest}; @@ -28,8 +31,10 @@ use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; use crate::engine::listener::{FlushListener, StallListener}; use crate::test_util::{ - build_rows, build_rows_for_key, flush_region, put_rows, reopen_region, rows_schema, - CreateRequestBuilder, MockWriteBufferManager, TestEnv, + build_rows, build_rows_for_key, flush_region, kafka_log_store_factory, + multiple_log_store_factories, prepare_test_for_kafka_log_store, put_rows, + raft_engine_log_store_factory, reopen_region, rows_schema, CreateRequestBuilder, + LogStoreFactory, MockWriteBufferManager, TestEnv, }; use crate::time_provider::TimeProvider; use crate::worker::MAX_INITIAL_CHECK_DELAY_SECS; @@ -231,13 +236,25 @@ async fn test_flush_empty() { assert_eq!(expected, batches.pretty_print().unwrap()); } -#[tokio::test] -async fn test_flush_reopen_region() { - let mut env = TestEnv::new(); +#[apply(multiple_log_store_factories)] +async fn test_flush_reopen_region(factory: Option) { + use std::collections::HashMap; + + use common_wal::options::{KafkaWalOptions, WalOptions}; + + common_telemetry::init_default_ut_logging(); + let Some(factory) = factory else { + return; + }; + + let mut env = TestEnv::new().with_log_store_factory(factory.clone()); let engine = env.create_engine(MitoConfig::default()).await; let region_id = RegionId::new(1, 1); - let request = CreateRequestBuilder::new().build(); + let topic = prepare_test_for_kafka_log_store(&factory).await; + let request = CreateRequestBuilder::new() + .kafka_topic(topic.clone()) + .build(); let region_dir = request.region_dir.clone(); let column_schemas = rows_schema(&request); @@ -263,7 +280,17 @@ async fn test_flush_reopen_region() { }; check_region(); - reopen_region(&engine, region_id, region_dir, true, Default::default()).await; + let mut options = HashMap::new(); + if let Some(topic) = &topic { + options.insert( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&WalOptions::Kafka(KafkaWalOptions { + topic: topic.to_string(), + })) + .unwrap(), + ); + }; + reopen_region(&engine, region_id, region_dir, true, options).await; check_region(); // Puts again. 
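Both tests above wire a region to its Kafka topic by serializing WalOptions::Kafka into the region options under WAL_OPTIONS_KEY. A small helper along these lines would capture that repeated pattern; the function name is hypothetical, while the key, option types, and serde_json call are the ones used in the tests.

use std::collections::HashMap;

use common_wal::options::{KafkaWalOptions, WalOptions, WAL_OPTIONS_KEY};

/// Hypothetical helper: builds the region options map that points a region's
/// WAL at the given Kafka topic; with no topic, the map stays empty, as in the
/// raft-engine cases above.
fn kafka_wal_region_options(topic: Option<&str>) -> HashMap<String, String> {
    let mut options = HashMap::new();
    if let Some(topic) = topic {
        let wal_options = WalOptions::Kafka(KafkaWalOptions {
            topic: topic.to_string(),
        });
        // The engine deserializes this JSON value again when the region is reopened.
        options.insert(
            WAL_OPTIONS_KEY.to_string(),
            serde_json::to_string(&wal_options).unwrap(),
        );
    }
    options
}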
diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index d7c671962c03..3ceff8a297b2 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -33,16 +33,23 @@ use api::v1::value::ValueData; use api::v1::{OpType, Row, Rows, SemanticType}; use common_base::readable_size::ReadableSize; use common_datasource::compression::CompressionType; +use common_telemetry::warn; use common_test_util::temp_dir::{create_temp_dir, TempDir}; +use common_wal::options::{KafkaWalOptions, WalOptions, WAL_OPTIONS_KEY}; use datatypes::arrow::array::{TimestampMillisecondArray, UInt64Array, UInt8Array}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; +use log_store::kafka::log_store::KafkaLogStore; use log_store::raft_engine::log_store::RaftEngineLogStore; use log_store::test_util::log_store_util; use object_store::manager::{ObjectStoreManager, ObjectStoreManagerRef}; use object_store::services::Fs; use object_store::util::join_dir; use object_store::ObjectStore; +use rskafka::client::partition::{Compression, UnknownTopicHandling}; +use rskafka::client::{Client, ClientBuilder}; +use rskafka::record::Record; +use rstest_reuse::template; use store_api::metadata::{ColumnMetadata, RegionMetadataRef}; use store_api::region_engine::RegionEngine; use store_api::region_request::{ @@ -75,11 +82,110 @@ pub(crate) fn new_noop_file_purger() -> FilePurgerRef { Arc::new(NoopFilePurger {}) } +pub(crate) fn raft_engine_log_store_factory() -> Option { + Some(LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory)) +} + +pub(crate) fn kafka_log_store_factory() -> Option { + let _ = dotenv::dotenv(); + let Ok(broker_endpoints) = std::env::var("GT_KAFKA_ENDPOINTS") else { + warn!("env GT_KAFKA_ENDPOINTS not found"); + return None; + }; + + let broker_endpoints = broker_endpoints + .split(',') + .map(|s| s.trim().to_string()) + .collect::>(); + + Some(LogStoreFactory::Kafka(KafkaLogStoreFactory { + broker_endpoints, + })) +} + +#[template] +#[rstest] +#[case::with_raft_engine(raft_engine_log_store_factory())] +#[case::with_kafka(kafka_log_store_factory())] +#[tokio::test] +pub(crate) fn multiple_log_store_factories(#[case] factory: Option) {} + +#[derive(Clone)] +pub(crate) struct RaftEngineLogStoreFactory; + +impl RaftEngineLogStoreFactory { + async fn create_log_store>(&self, wal_path: P) -> RaftEngineLogStore { + log_store_util::create_tmp_local_file_log_store(wal_path).await + } +} + +pub(crate) async fn prepare_test_for_kafka_log_store(factory: &LogStoreFactory) -> Option { + if let LogStoreFactory::Kafka(factory) = factory { + let topic = uuid::Uuid::new_v4().to_string(); + let client = factory.client().await; + append_noop_record(&client, &topic).await; + + Some(topic) + } else { + None + } +} + +pub(crate) async fn append_noop_record(client: &Client, topic: &str) { + let partition_client = client + .partition_client(topic, 0, UnknownTopicHandling::Retry) + .await + .unwrap(); + + partition_client + .produce( + vec![Record { + key: None, + value: None, + timestamp: rskafka::chrono::Utc::now(), + headers: Default::default(), + }], + Compression::NoCompression, + ) + .await + .unwrap(); +} +#[derive(Clone)] +pub(crate) struct KafkaLogStoreFactory { + broker_endpoints: Vec, +} + +impl KafkaLogStoreFactory { + async fn create_log_store(&self) -> KafkaLogStore { + log_store_util::create_kafka_log_store(self.broker_endpoints.clone()).await + } + + pub(crate) async fn client(&self) -> Client { + ClientBuilder::new(self.broker_endpoints.clone()) + .build() + .await + 
.unwrap() + } +} + +#[derive(Clone)] +pub(crate) enum LogStoreFactory { + RaftEngine(RaftEngineLogStoreFactory), + Kafka(KafkaLogStoreFactory), +} + +#[derive(Clone)] +pub(crate) enum LogStoreImpl { + RaftEngine(Arc), + Kafka(Arc), +} + /// Env to test mito engine. pub struct TestEnv { /// Path to store data. data_home: TempDir, - logstore: Option>, + log_store: Option, + log_store_factory: LogStoreFactory, object_store_manager: Option, } @@ -94,7 +200,8 @@ impl TestEnv { pub fn new() -> TestEnv { TestEnv { data_home: create_temp_dir(""), - logstore: None, + log_store: None, + log_store_factory: LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory), object_store_manager: None, } } @@ -103,7 +210,8 @@ impl TestEnv { pub fn with_prefix(prefix: &str) -> TestEnv { TestEnv { data_home: create_temp_dir(prefix), - logstore: None, + log_store: None, + log_store_factory: LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory), object_store_manager: None, } } @@ -112,13 +220,16 @@ impl TestEnv { pub fn with_data_home(data_home: TempDir) -> TestEnv { TestEnv { data_home, - logstore: None, + log_store: None, + log_store_factory: LogStoreFactory::RaftEngine(RaftEngineLogStoreFactory), object_store_manager: None, } } - pub fn get_logstore(&self) -> Option> { - self.logstore.clone() + /// Overwrites the original `log_store_factory`. + pub(crate) fn with_log_store_factory(mut self, log_store_factory: LogStoreFactory) -> TestEnv { + self.log_store_factory = log_store_factory; + self } pub fn get_object_store(&self) -> Option { @@ -139,24 +250,41 @@ impl TestEnv { pub async fn create_engine(&mut self, config: MitoConfig) -> MitoEngine { let (log_store, object_store_manager) = self.create_log_and_object_store_manager().await; - let logstore = Arc::new(log_store); let object_store_manager = Arc::new(object_store_manager); - self.logstore = Some(logstore.clone()); + self.log_store = Some(log_store.clone()); self.object_store_manager = Some(object_store_manager.clone()); let data_home = self.data_home().display().to_string(); - MitoEngine::new(&data_home, config, logstore, object_store_manager) - .await - .unwrap() + + match log_store { + LogStoreImpl::RaftEngine(log_store) => { + MitoEngine::new(&data_home, config, log_store, object_store_manager) + .await + .unwrap() + } + LogStoreImpl::Kafka(log_store) => { + MitoEngine::new(&data_home, config, log_store, object_store_manager) + .await + .unwrap() + } + } } /// Creates a new engine with specific config and existing logstore and object store manager. pub async fn create_follower_engine(&mut self, config: MitoConfig) -> MitoEngine { - let logstore = self.logstore.as_ref().unwrap().clone(); let object_store_manager = self.object_store_manager.as_ref().unwrap().clone(); let data_home = self.data_home().display().to_string(); - MitoEngine::new(&data_home, config, logstore, object_store_manager) - .await - .unwrap() + match self.log_store.as_ref().unwrap().clone() { + LogStoreImpl::RaftEngine(log_store) => { + MitoEngine::new(&data_home, config, log_store, object_store_manager) + .await + .unwrap() + } + LogStoreImpl::Kafka(log_store) => { + MitoEngine::new(&data_home, config, log_store, object_store_manager) + .await + .unwrap() + } + } } /// Creates a new engine with specific config and manager/listener/purge_scheduler under this env. 
@@ -168,24 +296,36 @@ impl TestEnv { ) -> MitoEngine { let (log_store, object_store_manager) = self.create_log_and_object_store_manager().await; - let logstore = Arc::new(log_store); let object_store_manager = Arc::new(object_store_manager); - self.logstore = Some(logstore.clone()); + self.log_store = Some(log_store.clone()); self.object_store_manager = Some(object_store_manager.clone()); let data_home = self.data_home().display().to_string(); - MitoEngine::new_for_test( - &data_home, - config, - logstore, - object_store_manager, - manager, - listener, - Arc::new(StdTimeProvider), - ) - .await - .unwrap() + match log_store { + LogStoreImpl::RaftEngine(log_store) => MitoEngine::new_for_test( + &data_home, + config, + log_store, + object_store_manager, + manager, + listener, + Arc::new(StdTimeProvider), + ) + .await + .unwrap(), + LogStoreImpl::Kafka(log_store) => MitoEngine::new_for_test( + &data_home, + config, + log_store, + object_store_manager, + manager, + listener, + Arc::new(StdTimeProvider), + ) + .await + .unwrap(), + } } pub async fn create_engine_with_multiple_object_stores( @@ -195,7 +335,8 @@ impl TestEnv { listener: Option, custom_storage_names: &[&str], ) -> MitoEngine { - let (logstore, mut object_store_manager) = self.create_log_and_object_store_manager().await; + let (log_store, mut object_store_manager) = + self.create_log_and_object_store_manager().await; for storage_name in custom_storage_names { let data_path = self .data_home @@ -210,23 +351,35 @@ impl TestEnv { let object_store = ObjectStore::new(builder).unwrap().finish(); object_store_manager.add(storage_name, object_store); } - let logstore = Arc::new(logstore); let object_store_manager = Arc::new(object_store_manager); - self.logstore = Some(logstore.clone()); + self.log_store = Some(log_store.clone()); self.object_store_manager = Some(object_store_manager.clone()); let data_home = self.data_home().display().to_string(); - MitoEngine::new_for_test( - &data_home, - config, - logstore, - object_store_manager, - manager, - listener, - Arc::new(StdTimeProvider), - ) - .await - .unwrap() + match log_store { + LogStoreImpl::RaftEngine(log_store) => MitoEngine::new_for_test( + &data_home, + config, + log_store, + object_store_manager, + manager, + listener, + Arc::new(StdTimeProvider), + ) + .await + .unwrap(), + LogStoreImpl::Kafka(log_store) => MitoEngine::new_for_test( + &data_home, + config, + log_store, + object_store_manager, + manager, + listener, + Arc::new(StdTimeProvider), + ) + .await + .unwrap(), + } } /// Creates a new engine with specific config and manager/listener/time provider under this env. 
@@ -239,50 +392,82 @@ impl TestEnv { ) -> MitoEngine { let (log_store, object_store_manager) = self.create_log_and_object_store_manager().await; - let logstore = Arc::new(log_store); let object_store_manager = Arc::new(object_store_manager); - self.logstore = Some(logstore.clone()); + self.log_store = Some(log_store.clone()); self.object_store_manager = Some(object_store_manager.clone()); let data_home = self.data_home().display().to_string(); - MitoEngine::new_for_test( - &data_home, - config, - logstore, - object_store_manager, - manager, - listener, - time_provider.clone(), - ) - .await - .unwrap() + match log_store { + LogStoreImpl::RaftEngine(log_store) => MitoEngine::new_for_test( + &data_home, + config, + log_store, + object_store_manager, + manager, + listener, + time_provider.clone(), + ) + .await + .unwrap(), + LogStoreImpl::Kafka(log_store) => MitoEngine::new_for_test( + &data_home, + config, + log_store, + object_store_manager, + manager, + listener, + time_provider.clone(), + ) + .await + .unwrap(), + } } /// Reopen the engine. pub async fn reopen_engine(&mut self, engine: MitoEngine, config: MitoConfig) -> MitoEngine { engine.stop().await.unwrap(); - MitoEngine::new( - &self.data_home().display().to_string(), - config, - self.logstore.clone().unwrap(), - self.object_store_manager.clone().unwrap(), - ) - .await - .unwrap() + match self.log_store.as_ref().unwrap().clone() { + LogStoreImpl::RaftEngine(log_store) => MitoEngine::new( + &self.data_home().display().to_string(), + config, + log_store, + self.object_store_manager.clone().unwrap(), + ) + .await + .unwrap(), + LogStoreImpl::Kafka(log_store) => MitoEngine::new( + &self.data_home().display().to_string(), + config, + log_store, + self.object_store_manager.clone().unwrap(), + ) + .await + .unwrap(), + } } /// Open the engine. pub async fn open_engine(&mut self, config: MitoConfig) -> MitoEngine { - MitoEngine::new( - &self.data_home().display().to_string(), - config, - self.logstore.clone().unwrap(), - self.object_store_manager.clone().unwrap(), - ) - .await - .unwrap() + match self.log_store.as_ref().unwrap().clone() { + LogStoreImpl::RaftEngine(log_store) => MitoEngine::new( + &self.data_home().display().to_string(), + config, + log_store, + self.object_store_manager.clone().unwrap(), + ) + .await + .unwrap(), + LogStoreImpl::Kafka(log_store) => MitoEngine::new( + &self.data_home().display().to_string(), + config, + log_store, + self.object_store_manager.clone().unwrap(), + ) + .await + .unwrap(), + } } /// Only initializes the object store manager, returns the default object store. @@ -297,25 +482,44 @@ impl TestEnv { let data_home = self.data_home().display().to_string(); config.sanitize(&data_home).unwrap(); - WorkerGroup::start( - Arc::new(config), - Arc::new(log_store), - Arc::new(object_store_manager), - ) - .await - .unwrap() + + match log_store { + LogStoreImpl::RaftEngine(log_store) => { + WorkerGroup::start(Arc::new(config), log_store, Arc::new(object_store_manager)) + .await + .unwrap() + } + LogStoreImpl::Kafka(log_store) => { + WorkerGroup::start(Arc::new(config), log_store, Arc::new(object_store_manager)) + .await + .unwrap() + } + } } /// Returns the log store and object store manager. 
- async fn create_log_and_object_store_manager( - &self, - ) -> (RaftEngineLogStore, ObjectStoreManager) { + async fn create_log_and_object_store_manager(&self) -> (LogStoreImpl, ObjectStoreManager) { let data_home = self.data_home.path(); let wal_path = data_home.join("wal"); - let log_store = log_store_util::create_tmp_local_file_log_store(&wal_path).await; - let object_store_manager = self.create_object_store_manager(); - (log_store, object_store_manager) + + match &self.log_store_factory { + LogStoreFactory::RaftEngine(factory) => { + let log_store = factory.create_log_store(wal_path).await; + ( + LogStoreImpl::RaftEngine(Arc::new(log_store)), + object_store_manager, + ) + } + LogStoreFactory::Kafka(factory) => { + let log_store = factory.create_log_store().await; + + ( + LogStoreImpl::Kafka(Arc::new(log_store)), + object_store_manager, + ) + } + } } fn create_object_store_manager(&self) -> ObjectStoreManager { @@ -399,6 +603,8 @@ pub struct CreateRequestBuilder { all_not_null: bool, engine: String, ts_type: ConcreteDataType, + /// kafka topic name + kafka_topic: Option, } impl Default for CreateRequestBuilder { @@ -412,6 +618,7 @@ impl Default for CreateRequestBuilder { all_not_null: false, engine: MITO_ENGINE_NAME.to_string(), ts_type: ConcreteDataType::timestamp_millisecond_datatype(), + kafka_topic: None, } } } @@ -464,6 +671,12 @@ impl CreateRequestBuilder { self } + #[must_use] + pub fn kafka_topic(mut self, topic: Option) -> Self { + self.kafka_topic = topic; + self + } + pub fn build(&self) -> RegionCreateRequest { let mut column_id = 0; let mut column_metadatas = Vec::with_capacity(self.tag_num + self.field_num + 1); @@ -504,12 +717,21 @@ impl CreateRequestBuilder { semantic_type: SemanticType::Timestamp, column_id, }); - + let mut options = self.options.clone(); + if let Some(topic) = &self.kafka_topic { + let wal_options = WalOptions::Kafka(KafkaWalOptions { + topic: topic.to_string(), + }); + options.insert( + WAL_OPTIONS_KEY.to_string(), + serde_json::to_string(&wal_options).unwrap(), + ); + } RegionCreateRequest { engine: self.engine.to_string(), column_metadatas, primary_key: self.primary_key.clone().unwrap_or(primary_key), - options: self.options.clone(), + options, region_dir: self.region_dir.clone(), } } diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index 2473cfc320ad..887f04a3b218 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -53,8 +53,8 @@ object-store.workspace = true operator.workspace = true prost.workspace = true query.workspace = true -rstest = "0.17" -rstest_reuse = "0.5" +rstest.workspace = true +rstest_reuse.workspace = true serde_json.workspace = true servers = { workspace = true, features = ["testing"] } session.workspace = true diff --git a/tests-integration/src/lib.rs b/tests-integration/src/lib.rs index d3e700151345..5def9351d0c9 100644 --- a/tests-integration/src/lib.rs +++ b/tests-integration/src/lib.rs @@ -26,8 +26,3 @@ pub mod test_util; pub mod standalone; #[cfg(test)] mod tests; - -#[cfg(test)] -// allowed because https://docs.rs/rstest_reuse/0.5.0/rstest_reuse/#use-rstest_reuse-at-the-top-of-your-crate -#[allow(clippy::single_component_path_imports)] -use rstest_reuse; From a80059b47f26b37925d6b44108be3fe7c7156260 Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:20:29 +0800 Subject: [PATCH 11/16] fix: recover memtable options when opening physical regions (#4102) * fix: recover memtable options when opening physical regions 
* chore: fmt * chore: merge data region options --- src/metric-engine/src/engine/create.rs | 11 +++-------- src/metric-engine/src/engine/open.rs | 4 ++-- src/metric-engine/src/engine/options.rs | 11 ++++------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs index c71375299c38..dad22c72f9ff 100644 --- a/src/metric-engine/src/engine/create.rs +++ b/src/metric-engine/src/engine/create.rs @@ -38,9 +38,7 @@ use store_api::region_request::{AffectedRows, RegionCreateRequest, RegionRequest use store_api::storage::consts::ReservedColumnId; use store_api::storage::RegionId; -use crate::engine::options::{ - set_index_options_for_data_region, set_memtable_options_for_data_region, -}; +use crate::engine::options::set_data_region_options; use crate::engine::MetricEngineInner; use crate::error::{ AddingFieldColumnSnafu, ColumnNotFoundSnafu, ColumnTypeMismatchSnafu, @@ -478,11 +476,8 @@ impl MetricEngineInner { data_region_request.column_metadatas.push(tsid_col); data_region_request.primary_key = primary_key; - // set index options - set_index_options_for_data_region(&mut data_region_request.options); - - // Set memtable options. - set_memtable_options_for_data_region(&mut data_region_request.options); + // set data region options + set_data_region_options(&mut data_region_request.options); data_region_request } diff --git a/src/metric-engine/src/engine/open.rs b/src/metric-engine/src/engine/open.rs index 952c923487bf..c42e0376562c 100644 --- a/src/metric-engine/src/engine/open.rs +++ b/src/metric-engine/src/engine/open.rs @@ -26,7 +26,7 @@ use store_api::region_request::{AffectedRows, RegionOpenRequest, RegionRequest}; use store_api::storage::RegionId; use super::MetricEngineInner; -use crate::engine::options::set_index_options_for_data_region; +use crate::engine::options::set_data_region_options; use crate::error::{OpenMitoRegionSnafu, Result}; use crate::metrics::{LOGICAL_REGION_COUNT, PHYSICAL_REGION_COUNT}; use crate::utils; @@ -80,7 +80,7 @@ impl MetricEngineInner { }; let mut data_region_options = request.options; - set_index_options_for_data_region(&mut data_region_options); + set_data_region_options(&mut data_region_options); let open_data_region_request = RegionOpenRequest { region_dir: data_region_dir, options: data_region_options, diff --git a/src/metric-engine/src/engine/options.rs b/src/metric-engine/src/engine/options.rs index 56981329db29..f22f39271c2e 100644 --- a/src/metric-engine/src/engine/options.rs +++ b/src/metric-engine/src/engine/options.rs @@ -30,20 +30,17 @@ const IGNORE_COLUMN_IDS_FOR_DATA_REGION: [ColumnId; 1] = [ReservedColumnId::tsid /// value and appropriately increasing the size of the index, it results in an improved indexing effect. const SEG_ROW_COUNT_FOR_DATA_REGION: u32 = 256; -/// Set the index options for the data region. -pub fn set_index_options_for_data_region(options: &mut HashMap) { +/// Sets data region specific options. +pub fn set_data_region_options(options: &mut HashMap) { + // Set the index options for the data region. options.insert( "index.inverted_index.ignore_column_ids".to_string(), IGNORE_COLUMN_IDS_FOR_DATA_REGION.iter().join(","), ); - options.insert( "index.inverted_index.segment_row_count".to_string(), SEG_ROW_COUNT_FOR_DATA_REGION.to_string(), ); -} - -/// Set memtable options for the data region. -pub fn set_memtable_options_for_data_region(options: &mut HashMap) { + // Set memtable options for the data region. 
options.insert("memtable.type".to_string(), "partition_tree".to_string()); } From a6269397c8924dbf9914ce13d81eded74a8c8f68 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Tue, 4 Jun 2024 16:43:33 +0800 Subject: [PATCH 12/16] fix: fix EntityTooSmall issue (#4100) * fix: fix EntityTooSmall issue * chore(ci): add minio to coverage * tests: add test for parquet writer * chore: move tests to `common-datasource` crate --- .github/workflows/develop.yml | 8 +++ Cargo.lock | 5 ++ src/common/datasource/Cargo.toml | 5 ++ src/common/datasource/src/file_format.rs | 2 + .../datasource/src/file_format/parquet.rs | 72 +++++++++++++++++++ src/common/datasource/src/lib.rs | 5 ++ src/mito2/Cargo.toml | 1 + .../minio/docker-compose-standalone.yml | 18 +++++ 8 files changed, 116 insertions(+) create mode 100644 tests-integration/fixtures/minio/docker-compose-standalone.yml diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index b397a0fd6046..deecdc454a95 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -518,6 +518,9 @@ jobs: - name: Setup kafka server working-directory: tests-integration/fixtures/kafka run: docker compose -f docker-compose-standalone.yml up -d --wait + - name: Setup minio + working-directory: tests-integration/fixtures/minio + run: docker compose -f docker-compose-standalone.yml up -d --wait - name: Run nextest cases run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F pyo3_backend -F dashboard env: @@ -528,6 +531,11 @@ jobs: GT_S3_ACCESS_KEY_ID: ${{ secrets.AWS_CI_TEST_ACCESS_KEY_ID }} GT_S3_ACCESS_KEY: ${{ secrets.AWS_CI_TEST_SECRET_ACCESS_KEY }} GT_S3_REGION: ${{ vars.AWS_CI_TEST_BUCKET_REGION }} + GT_MINIO_BUCKET: greptime + GT_MINIO_ACCESS_KEY_ID: superpower_ci_user + GT_MINIO_ACCESS_KEY: superpower_password + GT_MINIO_REGION: us-west-2 + GT_MINIO_ENDPOINT_URL: http://127.0.0.1:9000 GT_ETCD_ENDPOINTS: http://127.0.0.1:2379 GT_KAFKA_ENDPOINTS: 127.0.0.1:9092 UNITTEST_LOG_DIR: "__unittest_logs" diff --git a/Cargo.lock b/Cargo.lock index d722b1e9f2ca..44ad55172287 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1752,20 +1752,24 @@ dependencies = [ "async-compression 0.3.15", "async-trait", "bytes", + "common-base", "common-error", "common-macro", "common-recordbatch", "common-runtime", + "common-telemetry", "common-test-util", "datafusion 38.0.0", "datatypes", "derive_builder 0.12.0", + "dotenv", "futures", "lazy_static", "object-store", "orc-rust", "parquet", "paste", + "rand", "regex", "serde", "snafu 0.8.3", @@ -1773,6 +1777,7 @@ dependencies = [ "tokio", "tokio-util", "url", + "uuid", ] [[package]] diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index ece0edd9fe0d..e4bdec2c70dc 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -20,6 +20,7 @@ async-compression = { version = "0.3", features = [ ] } async-trait.workspace = true bytes.workspace = true +common-base.workspace = true common-error.workspace = true common-macro.workspace = true common-recordbatch.workspace = true @@ -33,6 +34,7 @@ object-store.workspace = true orc-rust = { git = "https://github.com/datafusion-contrib/datafusion-orc.git", rev = "502217315726314c4008808fe169764529640599" } parquet.workspace = true paste = "1.0" +rand.workspace = true regex = "1.7" serde.workspace = true snafu.workspace = true @@ -42,4 +44,7 @@ tokio-util.workspace = true url = "2.3" [dev-dependencies] +common-telemetry.workspace = true common-test-util.workspace = true +dotenv.workspace = true 
+uuid.workspace = true diff --git a/src/common/datasource/src/file_format.rs b/src/common/datasource/src/file_format.rs index 5bb9258ad3d0..c555f763b59b 100644 --- a/src/common/datasource/src/file_format.rs +++ b/src/common/datasource/src/file_format.rs @@ -46,6 +46,7 @@ use crate::buffered_writer::{DfRecordBatchEncoder, LazyBufferedWriter}; use crate::compression::CompressionType; use crate::error::{self, Result}; use crate::share_buffer::SharedBuffer; +use crate::DEFAULT_WRITE_BUFFER_SIZE; pub const FORMAT_COMPRESSION_TYPE: &str = "compression_type"; pub const FORMAT_DELIMITER: &str = "delimiter"; @@ -204,6 +205,7 @@ pub async fn stream_to_file T>( store .writer_with(&path) .concurrent(concurrency) + .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize) .await .map(|v| v.into_futures_async_write().compat_write()) .context(error::WriteObjectSnafu { path }) diff --git a/src/common/datasource/src/file_format/parquet.rs b/src/common/datasource/src/file_format/parquet.rs index 2e887ac2f7c3..f5125757b956 100644 --- a/src/common/datasource/src/file_format/parquet.rs +++ b/src/common/datasource/src/file_format/parquet.rs @@ -39,6 +39,7 @@ use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder, LazyBuffer use crate::error::{self, Result}; use crate::file_format::FileFormat; use crate::share_buffer::SharedBuffer; +use crate::DEFAULT_WRITE_BUFFER_SIZE; #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub struct ParquetFormat {} @@ -197,6 +198,7 @@ impl BufferedWriter { store .writer_with(&path) .concurrent(concurrency) + .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize) .await .map(|v| v.into_futures_async_write().compat_write()) .context(error::WriteObjectSnafu { path }) @@ -276,9 +278,19 @@ pub async fn stream_to_parquet( #[cfg(test)] mod tests { + use std::env; + use std::sync::Arc; + + use common_telemetry::warn; use common_test_util::find_workspace_path; + use datatypes::arrow::array::{ArrayRef, Int64Array, RecordBatch}; + use datatypes::arrow::datatypes::{DataType, Field, Schema}; + use object_store::services::S3; + use object_store::ObjectStore; + use rand::{thread_rng, Rng}; use super::*; + use crate::file_format::parquet::BufferedWriter; use crate::test_util::{format_schema, test_store}; fn test_data_root() -> String { @@ -296,4 +308,64 @@ mod tests { assert_eq!(vec!["num: Int64: NULL", "str: Utf8: NULL"], formatted); } + + #[tokio::test] + async fn test_parquet_writer() { + common_telemetry::init_default_ut_logging(); + let _ = dotenv::dotenv(); + let Ok(bucket) = env::var("GT_MINIO_BUCKET") else { + warn!("ignoring test parquet writer"); + return; + }; + + let mut builder = S3::default(); + let _ = builder + .root(&uuid::Uuid::new_v4().to_string()) + .access_key_id(&env::var("GT_MINIO_ACCESS_KEY_ID").unwrap()) + .secret_access_key(&env::var("GT_MINIO_ACCESS_KEY").unwrap()) + .bucket(&bucket) + .region(&env::var("GT_MINIO_REGION").unwrap()) + .endpoint(&env::var("GT_MINIO_ENDPOINT_URL").unwrap()); + + let object_store = ObjectStore::new(builder).unwrap().finish(); + let file_path = uuid::Uuid::new_v4().to_string(); + let fields = vec![ + Field::new("field1", DataType::Int64, true), + Field::new("field0", DataType::Int64, true), + ]; + let arrow_schema = Arc::new(Schema::new(fields)); + let mut buffered_writer = BufferedWriter::try_new( + file_path.clone(), + object_store.clone(), + arrow_schema.clone(), + None, + // Sets a small value. 
+ 128, + 8, + ) + .await + .unwrap(); + let rows = 200000; + let generator = || { + let columns: Vec = vec![ + Arc::new(Int64Array::from( + (0..rows) + .map(|_| thread_rng().gen::()) + .collect::>(), + )), + Arc::new(Int64Array::from( + (0..rows) + .map(|_| thread_rng().gen::()) + .collect::>(), + )), + ]; + RecordBatch::try_new(arrow_schema.clone(), columns).unwrap() + }; + let batch = generator(); + // Writes about ~30Mi + for _ in 0..10 { + buffered_writer.write(&batch).await.unwrap(); + } + buffered_writer.close().await.unwrap(); + } } diff --git a/src/common/datasource/src/lib.rs b/src/common/datasource/src/lib.rs index 8cb8756e06f8..5d24b1cdf49d 100644 --- a/src/common/datasource/src/lib.rs +++ b/src/common/datasource/src/lib.rs @@ -27,3 +27,8 @@ pub mod test_util; #[cfg(test)] pub mod tests; pub mod util; + +use common_base::readable_size::ReadableSize; + +/// Default write buffer size, it should be greater than the default minimum upload part of S3 (5mb). +pub const DEFAULT_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(8); diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index f3a747dcd02f..3994ebb43985 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -75,6 +75,7 @@ uuid.workspace = true common-procedure-test.workspace = true common-test-util.workspace = true criterion = "0.4" +dotenv.workspace = true log-store.workspace = true object-store = { workspace = true, features = ["services-memory"] } rskafka.workspace = true diff --git a/tests-integration/fixtures/minio/docker-compose-standalone.yml b/tests-integration/fixtures/minio/docker-compose-standalone.yml new file mode 100644 index 000000000000..139cb916a950 --- /dev/null +++ b/tests-integration/fixtures/minio/docker-compose-standalone.yml @@ -0,0 +1,18 @@ +version: '3.8' +services: + minio: + image: bitnami/minio:2024 + ports: + - '9000:9000' + - '9001:9001' + environment: + - MINIO_ROOT_USER=superpower_ci_user + - MINIO_ROOT_PASSWORD=superpower_password + - MINIO_DEFAULT_BUCKETS=greptime + - BITNAMI_DEBUG=true + volumes: + - 'minio_data:/bitnami/minio/data' + +volumes: + minio_data: + driver: local From 0a07130931d3a27d71e1c962190edd75832b103b Mon Sep 17 00:00:00 2001 From: discord9 <55937128+discord9@users.noreply.github.com> Date: Tue, 4 Jun 2024 17:07:13 +0800 Subject: [PATCH 13/16] fix(flow): mfp operator missing rows (#4084) * fix: mfp missing rows if run twice in same tick * tests: run mfp for multiple times * refactor: make mfp less hacky * feat: make channel larger * chore: typos --- src/flow/src/adapter.rs | 2 - src/flow/src/compute/render/map.rs | 60 +++++++++++++++++++++++++++++- src/flow/src/repr.rs | 3 +- 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/flow/src/adapter.rs b/src/flow/src/adapter.rs index 2e66f3850be8..31f4fadf0326 100644 --- a/src/flow/src/adapter.rs +++ b/src/flow/src/adapter.rs @@ -64,8 +64,6 @@ mod table_source; use error::Error; -pub const PER_REQ_MAX_ROW_CNT: usize = 8192; - // TODO: replace this with `GREPTIME_TIMESTAMP` before v0.9 pub const AUTO_CREATED_PLACEHOLDER_TS_COL: &str = "__ts_placeholder"; diff --git a/src/flow/src/compute/render/map.rs b/src/flow/src/compute/render/map.rs index 50bd48f5fb70..91cb37c6cf93 100644 --- a/src/flow/src/compute/render/map.rs +++ b/src/flow/src/compute/render/map.rs @@ -113,9 +113,21 @@ fn mfp_subgraph( scheduler: &Scheduler, send: &PortCtx, ) { + // all updates that should be send immediately + let mut output_now = vec![]; let run_mfp = || { - let all_updates = eval_mfp_core(input, mfp_plan, now, 
err_collector); - arrange.write().apply_updates(now, all_updates)?; + let mut all_updates = eval_mfp_core(input, mfp_plan, now, err_collector); + all_updates.retain(|(kv, ts, d)| { + if *ts > now { + true + } else { + output_now.push((kv.clone(), *ts, *d)); + false + } + }); + let future_updates = all_updates; + + arrange.write().apply_updates(now, future_updates)?; Ok(()) }; err_collector.run(run_mfp); @@ -130,13 +142,19 @@ fn mfp_subgraph( std::ops::Bound::Excluded(from), std::ops::Bound::Included(now), ); + + // find all updates that need to be send from arrangement let output_kv = arrange.read().get_updates_in_range(range); + // the output is expected to be key -> empty val let output = output_kv .into_iter() + .chain(output_now) // chain previous immediately send updates .map(|((key, _v), ts, diff)| (key, ts, diff)) .collect_vec(); + // send output send.give(output); + let run_compaction = || { arrange.write().compact_to(now)?; Ok(()) @@ -305,4 +323,42 @@ mod test { ]); run_and_check(&mut state, &mut df, 1..5, expected, output); } + + /// test if mfp operator can run multiple times within same tick + #[test] + fn test_render_mfp_multiple_times() { + let mut df = Hydroflow::new(); + let mut state = DataflowState::default(); + let mut ctx = harness_test_ctx(&mut df, &mut state); + + let (sender, recv) = tokio::sync::broadcast::channel(1000); + let collection = ctx.render_source(recv).unwrap(); + ctx.insert_global(GlobalId::User(1), collection); + let input_plan = Plan::Get { + id: expr::Id::Global(GlobalId::User(1)), + }; + let typ = RelationType::new(vec![ColumnType::new_nullable( + ConcreteDataType::int64_datatype(), + )]); + // filter: col(0)>1 + let mfp = MapFilterProject::new(1) + .filter(vec![ScalarExpr::Column(0).call_binary( + ScalarExpr::literal(1.into(), ConcreteDataType::int32_datatype()), + BinaryFunc::Gt, + )]) + .unwrap(); + let bundle = ctx + .render_mfp(Box::new(input_plan.with_types(typ)), mfp) + .unwrap(); + + let output = get_output_handle(&mut ctx, bundle); + drop(ctx); + sender.send((Row::new(vec![2.into()]), 0, 1)).unwrap(); + state.run_available_with_schedule(&mut df); + assert_eq!(output.borrow().len(), 1); + output.borrow_mut().clear(); + sender.send((Row::new(vec![3.into()]), 0, 1)).unwrap(); + state.run_available_with_schedule(&mut df); + assert_eq!(output.borrow().len(), 1); + } } diff --git a/src/flow/src/repr.rs b/src/flow/src/repr.rs index e918044c0d91..e28689be4008 100644 --- a/src/flow/src/repr.rs +++ b/src/flow/src/repr.rs @@ -53,7 +53,8 @@ pub type KeyValDiffRow = ((Row, Row), Timestamp, Diff); /// broadcast channel capacity, can be important to memory consumption, since this influence how many /// updates can be buffered in memory in the entire dataflow -pub const BROADCAST_CAP: usize = 8192; +/// TODO(discord9): add config for this, so cpu&mem usage can be balanced and configured by this +pub const BROADCAST_CAP: usize = 65535; /// Convert a value that is or can be converted to Datetime to internal timestamp /// From c152472383ab4f3a6058a1eaeac51445aa55f7e6 Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Tue, 4 Jun 2024 17:27:02 +0800 Subject: [PATCH 14/16] refactor: move http event handler to a separate file --- src/frontend/src/instance/log_handler.rs | 25 ++-- src/frontend/src/pipeline.rs | 5 +- src/servers/src/http.rs | 5 +- src/servers/src/http/event.rs | 140 +++++++++++++++++++++++ src/servers/src/http/handler.rs | 112 +----------------- src/servers/src/query_handler.rs | 9 +- 6 files changed, 162 insertions(+), 134 deletions(-) create mode 
100644 src/servers/src/http/event.rs diff --git a/src/frontend/src/instance/log_handler.rs b/src/frontend/src/instance/log_handler.rs index 35541314c2ef..dd34614b3ae9 100644 --- a/src/frontend/src/instance/log_handler.rs +++ b/src/frontend/src/instance/log_handler.rs @@ -18,7 +18,7 @@ use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use client::Output; use common_error::ext::BoxedError; use pipeline::{GreptimeTransformer, Pipeline}; -use servers::error::{AuthSnafu, ExecuteGrpcRequestSnafu}; +use servers::error::{AuthSnafu, ExecuteGrpcRequestSnafu, Result as ServerResult}; use servers::query_handler::LogHandler; use session::context::QueryContextRef; use snafu::ResultExt; @@ -31,7 +31,7 @@ impl LogHandler for Instance { &self, log: RowInsertRequests, ctx: QueryContextRef, - ) -> servers::error::Result { + ) -> ServerResult { self.plugins .get::() .as_ref() @@ -43,9 +43,9 @@ impl LogHandler for Instance { async fn get_pipeline( &self, - query_ctx: QueryContextRef, name: &str, - ) -> servers::error::Result> { + query_ctx: QueryContextRef, + ) -> ServerResult> { self.pipeline_operator .get_pipeline(query_ctx, name) .await @@ -55,24 +55,17 @@ impl LogHandler for Instance { async fn insert_pipeline( &self, - query_ctx: QueryContextRef, name: &str, content_type: &str, pipeline: &str, - ) -> servers::error::Result<()> { + query_ctx: QueryContextRef, + ) -> ServerResult<()> { self.pipeline_operator - .insert_pipeline(query_ctx, name, content_type, pipeline) + .insert_pipeline(name, content_type, pipeline, query_ctx) .await - .map_err(BoxedError::new) - .context(servers::error::InsertPipelineSnafu { name })?; - Ok(()) } - async fn delete_pipeline( - &self, - _query_ctx: QueryContextRef, - _name: &str, - ) -> servers::error::Result<()> { + async fn delete_pipeline(&self, _name: &str, _query_ctx: QueryContextRef) -> ServerResult<()> { todo!("delete_pipeline") } } @@ -82,7 +75,7 @@ impl Instance { &self, log: RowInsertRequests, ctx: QueryContextRef, - ) -> servers::error::Result { + ) -> ServerResult { self.inserter .handle_log_inserts(log, ctx, self.statement_executor.as_ref()) .await diff --git a/src/frontend/src/pipeline.rs b/src/frontend/src/pipeline.rs index 3dd107f7dcac..92fc4efb460a 100644 --- a/src/frontend/src/pipeline.rs +++ b/src/frontend/src/pipeline.rs @@ -26,6 +26,7 @@ use operator::statement::StatementExecutorRef; use pipeline::table::{PipelineTable, PipelineTableRef}; use pipeline::{GreptimeTransformer, Pipeline}; use query::QueryEngineRef; +use servers::error::Result as ServerResult; use session::context::{QueryContext, QueryContextRef}; use snafu::{OptionExt, ResultExt}; use table::TableRef; @@ -210,11 +211,11 @@ impl PipelineOperator { pub async fn insert_pipeline( &self, - query_ctx: QueryContextRef, name: &str, content_type: &str, pipeline: &str, - ) -> servers::error::Result<()> { + query_ctx: QueryContextRef, + ) -> ServerResult<()> { self.create_pipeline_table_if_not_exists(query_ctx.current_catalog()) .await .map_err(|e| { diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index c54decff4160..d9d8b6ec2731 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -73,6 +73,7 @@ use crate::query_handler::{ use crate::server::Server; pub mod authorize; +pub mod event; pub mod handler; pub mod header; pub mod influxdb; @@ -711,8 +712,8 @@ impl HttpServer { fn route_log(log_handler: LogHandlerRef) -> Router { Router::new() - .route("/logs", routing::post(handler::log_ingester)) - .route("/pipelines", routing::post(handler::add_pipeline)) + 
.route("/logs", routing::post(event::log_ingester))
+            .route("/pipelines", routing::post(event::add_pipeline))
             .layer(
                 ServiceBuilder::new()
                     .layer(HandleErrorLayer::new(handle_error))
diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs
new file mode 100644
index 000000000000..908160c17f39
--- /dev/null
+++ b/src/servers/src/http/event.rs
@@ -0,0 +1,140 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+
+use api::v1::{RowInsertRequest, RowInsertRequests, Rows};
+use axum::extract::{Json, Query, State, TypedHeader};
+use axum::headers::ContentType;
+use axum::Extension;
+use common_telemetry::error;
+use pipeline::Value as PipelineValue;
+use schemars::JsonSchema;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use session::context::QueryContextRef;
+use snafu::{OptionExt, ResultExt};
+
+use crate::error::{
+    InsertLogSnafu, InvalidParameterSnafu, ParseJsonSnafu, Result, UnsupportedContentTypeSnafu,
+};
+use crate::http::greptime_result_v1::GreptimedbV1Response;
+use crate::http::HttpResponse;
+use crate::query_handler::LogHandlerRef;
+
+#[derive(Debug, Default, Serialize, Deserialize, JsonSchema)]
+pub struct LogIngesterQueryParams {
+    pub table_name: Option<String>,
+    pub db: Option<String>,
+    pub pipeline_name: Option<String>,
+}
+
+#[axum_macros::debug_handler]
+pub async fn add_pipeline(
+    State(handler): State<LogHandlerRef>,
+    Extension(query_ctx): Extension<QueryContextRef>,
+    Json(payload): Json<Value>,
+) -> Result<String> {
+    let name = payload["name"].as_str().context(InvalidParameterSnafu {
+        reason: "name is required in payload",
+    })?;
+    let pipeline = payload["pipeline"]
+        .as_str()
+        .context(InvalidParameterSnafu {
+            reason: "pipeline is required in payload",
+        })?;
+
+    let content_type = "yaml";
+    let result = handler
+        .insert_pipeline(name, content_type, pipeline, query_ctx)
+        .await;
+
+    result.map(|_| "ok".to_string()).map_err(|e| {
+        error!(e; "failed to insert pipeline");
+        e
+    })
+}
+
+#[axum_macros::debug_handler]
+pub async fn log_ingester(
+    State(state): State<LogHandlerRef>,
+    Query(query_params): Query<LogIngesterQueryParams>,
+    Extension(query_ctx): Extension<QueryContextRef>,
+    TypedHeader(content_type): TypedHeader<ContentType>,
+    payload: String,
+) -> Result<HttpResponse> {
+    let value;
+    // TODO (qtang): we should decide json or jsonl
+    if content_type == ContentType::json() {
+        value = serde_json::from_str(&payload).context(ParseJsonSnafu)?;
+    // TODO (qtang): we should decide which content type to support
+    // form_url_encoded type is only a placeholder
+    } else if content_type == ContentType::form_url_encoded() {
+        value = parse_space_separated_log(payload)?;
+    } else {
+        return UnsupportedContentTypeSnafu { content_type }.fail();
+    }
+    log_ingester_inner(state, query_params, query_ctx, value)
+        .await
+        .or_else(|e| InsertLogSnafu { msg: e }.fail())
+}
+
+fn parse_space_separated_log(payload: String) -> Result<Value> {
+    // ToStructuredLogSnafu
+    let _log = payload.split_whitespace().collect::<Vec<&str>>();
+    // TODO (qtang): implement this
+    todo!()
+}
+
+async fn log_ingester_inner(
+    state:
LogHandlerRef, + query_params: LogIngesterQueryParams, + query_ctx: QueryContextRef, + payload: Value, +) -> std::result::Result { + let pipeline_id = query_params + .pipeline_name + .ok_or("pipeline_name is required".to_string())?; + + let pipeline_data = PipelineValue::try_from(payload)?; + + let pipeline = state + .get_pipeline(&pipeline_id, query_ctx.clone()) + .await + .map_err(|e| e.to_string())?; + let transformed_data: Rows = pipeline.exec(pipeline_data)?; + + let table_name = query_params + .table_name + .ok_or("table_name is required".to_string())?; + + let insert_request = RowInsertRequest { + rows: Some(transformed_data), + table_name: table_name.clone(), + }; + let insert_requests = RowInsertRequests { + inserts: vec![insert_request], + }; + state + .insert_log(insert_requests, query_ctx) + .await + .map(|_| { + HttpResponse::GreptimedbV1(GreptimedbV1Response { + output: vec![], + execution_time_ms: 0, + resp_metrics: HashMap::new(), + }) + }) + .map_err(|e| e.to_string()) +} diff --git a/src/servers/src/http/handler.rs b/src/servers/src/http/handler.rs index fd9623864e3f..79f60639d272 100644 --- a/src/servers/src/http/handler.rs +++ b/src/servers/src/http/handler.rs @@ -17,9 +17,7 @@ use std::env; use std::time::Instant; use aide::transform::TransformOperation; -use api::v1::{RowInsertRequest, RowInsertRequests, Rows}; -use axum::extract::{Json, Query, State, TypedHeader}; -use axum::headers::ContentType; +use axum::extract::{Json, Query, State}; use axum::response::{IntoResponse, Response}; use axum::{Extension, Form}; use common_error::ext::ErrorExt; @@ -28,16 +26,13 @@ use common_plugins::GREPTIME_EXEC_WRITE_COST; use common_query::{Output, OutputData}; use common_recordbatch::util; use common_telemetry::tracing; -use pipeline::Value as PipelineValue; use query::parser::{PromQuery, DEFAULT_LOOKBACK_STRING}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_json::Value; use session::context::QueryContextRef; -use snafu::ResultExt; use super::header::collect_plan_metrics; -use crate::error::{Error, InsertLogSnafu, ParseJsonSnafu, UnsupportedContentTypeSnafu}; use crate::http::arrow_result::ArrowResponse; use crate::http::csv_result::CsvResponse; use crate::http::error_result::ErrorResponse; @@ -50,7 +45,6 @@ use crate::http::{ }; use crate::metrics_handler::MetricsHandler; use crate::query_handler::sql::ServerSqlQueryHandlerRef; -use crate::query_handler::LogHandlerRef; #[derive(Debug, Default, Serialize, Deserialize, JsonSchema)] pub struct SqlQuery { @@ -71,110 +65,6 @@ pub struct SqlQuery { pub limit: Option, } -#[derive(Debug, Default, Serialize, Deserialize, JsonSchema)] -pub struct LogIngesterQueryParams { - pub table_name: Option, - pub db: Option, - pub pipeline_name: Option, -} - -fn parse_space_separated_log(payload: String) -> Result { - // ToStructuredLogSnafu - let _log = payload.split_whitespace().collect::>(); - // TODO (qtang): implement this - todo!() -} - -async fn log_ingester_inner( - state: LogHandlerRef, - query_params: LogIngesterQueryParams, - query_ctx: QueryContextRef, - payload: Value, -) -> Result { - let pipeline_id = query_params - .pipeline_name - .ok_or("pipeline_name is required".to_string())?; - - let pipeline_data = PipelineValue::try_from(payload)?; - - let pipeline = state - .get_pipeline(query_ctx.clone(), &pipeline_id) - .await - .map_err(|e| e.to_string())?; - let transformed_data: Rows = pipeline.exec(pipeline_data)?; - - let table_name = query_params - .table_name - .ok_or("table_name is 
required".to_string())?; - - let insert_request = RowInsertRequest { - rows: Some(transformed_data), - table_name: table_name.clone(), - }; - let insert_requests = RowInsertRequests { - inserts: vec![insert_request], - }; - state - .insert_log(insert_requests, query_ctx) - .await - .map(|_| { - HttpResponse::GreptimedbV1(GreptimedbV1Response { - output: vec![], - execution_time_ms: 0, - resp_metrics: HashMap::new(), - }) - }) - .map_err(|e| e.to_string()) -} - -/// handler to log ingester -#[axum_macros::debug_handler] -pub async fn log_ingester( - State(state): State, - Query(query_params): Query, - Extension(query_ctx): Extension, - TypedHeader(content_type): TypedHeader, - payload: String, -) -> Result { - let value; - // TODO (qtang): we should decide json or jsonl - if content_type == ContentType::json() { - value = serde_json::from_str(&payload).context(ParseJsonSnafu)?; - // TODO (qtang): we should decide which content type to support - // form_url_cncoded type is only placeholder - } else if content_type == ContentType::form_url_encoded() { - value = parse_space_separated_log(payload)?; - } else { - return UnsupportedContentTypeSnafu { content_type }.fail(); - } - log_ingester_inner(state, query_params, query_ctx, value) - .await - .or_else(|e| InsertLogSnafu { msg: e }.fail()) -} - -#[axum_macros::debug_handler] -pub async fn add_pipeline( - State(_state): State, - Query(_query_params): Query, - Extension(_query_ctx): Extension, - TypedHeader(_content_type): TypedHeader, - Json(paylod): Json, -) -> String { - let name = paylod["name"].as_str().unwrap(); - let pipeline = paylod["pipeline"].as_str().unwrap(); - let content_type = "yaml"; - let result = _state - .insert_pipeline(_query_ctx, name, content_type, pipeline) - .await; - match result { - Ok(_) => String::from("ok"), - Err(e) => { - common_telemetry::error!("failed to insert pipeline.{e:?}"); - e.to_string() - } - } -} - /// Handler to execute sql #[axum_macros::debug_handler] #[tracing::instrument(skip_all, fields(protocol = "http", request_type = "sql"))] diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index c2ef268d494f..d6a280f4a685 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -124,17 +124,20 @@ pub trait OpenTelemetryProtocolHandler { #[async_trait] pub trait LogHandler { async fn insert_log(&self, log: RowInsertRequests, ctx: QueryContextRef) -> Result; + async fn get_pipeline( &self, - query_ctx: QueryContextRef, name: &str, + query_ctx: QueryContextRef, ) -> Result>; + async fn insert_pipeline( &self, - query_ctx: QueryContextRef, name: &str, content_type: &str, pipeline: &str, + query_ctx: QueryContextRef, ) -> Result<()>; - async fn delete_pipeline(&self, query_ctx: QueryContextRef, name: &str) -> Result<()>; + + async fn delete_pipeline(&self, name: &str, query_ctx: QueryContextRef) -> Result<()>; } From c0aed1d267f7f1d638bedc68456ed7ecc794c6bf Mon Sep 17 00:00:00 2001 From: LFC <990479+MichaelScofield@users.noreply.github.com> Date: Tue, 4 Jun 2024 18:03:33 +0800 Subject: [PATCH 15/16] feat: set global runtime size by config file (#4063) * set global runtime size * fix: resolve PR comments * fix: log the whole option * fix ci * debug ci * debug ci --------- Co-authored-by: Weny Xu --- .../setup-greptimedb-cluster/action.yml | 1 + .../setup-greptimedb-cluster/values.yaml | 18 ++ Cargo.lock | 5 +- config/config.md | 16 ++ config/datanode.example.toml | 9 + config/frontend.example.toml | 9 + config/metasrv.example.toml | 9 + 
config/standalone.example.toml | 9 + src/cmd/src/datanode.rs | 73 +++--- src/cmd/src/frontend.rs | 59 +++-- src/cmd/src/metasrv.rs | 56 +++-- src/cmd/src/options.rs | 22 ++ src/cmd/src/standalone.rs | 71 ++++-- src/cmd/tests/load_config_test.rs | 218 ++++++++++++++++++ src/common/config/src/config.rs | 7 +- src/common/runtime/Cargo.toml | 4 +- src/common/runtime/src/global.rs | 47 +++- src/common/runtime/src/lib.rs | 2 +- src/common/test-util/Cargo.toml | 2 +- src/common/test-util/src/recordbatch.rs | 23 ++ src/datanode/src/config.rs | 58 ++++- src/frontend/src/frontend.rs | 2 +- src/frontend/src/instance.rs | 2 +- src/frontend/src/server.rs | 4 +- src/frontend/src/service_config/datanode.rs | 2 +- src/meta-srv/src/metasrv.rs | 2 +- tests-integration/src/test_util.rs | 25 -- tests-integration/tests/grpc.rs | 6 +- 28 files changed, 600 insertions(+), 161 deletions(-) create mode 100644 .github/actions/setup-greptimedb-cluster/values.yaml create mode 100644 src/cmd/tests/load_config_test.rs diff --git a/.github/actions/setup-greptimedb-cluster/action.yml b/.github/actions/setup-greptimedb-cluster/action.yml index 93d8c569c95d..eaf0032c7715 100644 --- a/.github/actions/setup-greptimedb-cluster/action.yml +++ b/.github/actions/setup-greptimedb-cluster/action.yml @@ -57,6 +57,7 @@ runs: greptime/greptimedb-cluster \ --create-namespace \ -n my-greptimedb \ + --values ./.github/actions/setup-greptimedb-cluster/values.yaml \ --wait \ --wait-for-jobs - name: Wait for GreptimeDB diff --git a/.github/actions/setup-greptimedb-cluster/values.yaml b/.github/actions/setup-greptimedb-cluster/values.yaml new file mode 100644 index 000000000000..b7ac1eb86e17 --- /dev/null +++ b/.github/actions/setup-greptimedb-cluster/values.yaml @@ -0,0 +1,18 @@ +meta: + config: |- + [runtime] + read_rt_size = 8 + write_rt_size = 8 + bg_rt_size = 8 +datanode: + config: |- + [runtime] + read_rt_size = 8 + write_rt_size = 8 + bg_rt_size = 8 +frontend: + config: |- + [runtime] + read_rt_size = 8 + write_rt_size = 8 + bg_rt_size = 8 \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 44ad55172287..306bccd24f43 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2080,9 +2080,11 @@ dependencies = [ "common-macro", "common-telemetry", "lazy_static", + "num_cpus", "once_cell", "paste", "prometheus", + "serde", "snafu 0.8.3", "tokio", "tokio-metrics", @@ -11049,8 +11051,7 @@ dependencies = [ [[package]] name = "tokio-metrics-collector" version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767da47381602cc481653456823b3ebb600e83d5dd4e0293da9b5566c6c00f0" +source = "git+https://github.com/MichaelScofield/tokio-metrics-collector.git?rev=89d692d5753d28564a7aac73c6ac5aba22243ba0#89d692d5753d28564a7aac73c6ac5aba22243ba0" dependencies = [ "lazy_static", "parking_lot 0.12.3", diff --git a/config/config.md b/config/config.md index 912e8ca7508c..5c4878c9d37d 100644 --- a/config/config.md +++ b/config/config.md @@ -13,6 +13,10 @@ | `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. | | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. | | `default_timezone` | String | `None` | The default timezone of the server. | +| `runtime` | -- | -- | The runtime options. | +| `runtime.read_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | +| `runtime.write_rt_size` | Integer | `8` | The number of threads to execute the runtime for global write operations. 
| +| `runtime.bg_rt_size` | Integer | `8` | The number of threads to execute the runtime for global background operations. | | `http` | -- | -- | The HTTP server options. | | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. | | `http.timeout` | String | `30s` | HTTP request timeout. | @@ -154,6 +158,10 @@ | --- | -----| ------- | ----------- | | `mode` | String | `standalone` | The running mode of the datanode. It can be `standalone` or `distributed`. | | `default_timezone` | String | `None` | The default timezone of the server. | +| `runtime` | -- | -- | The runtime options. | +| `runtime.read_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | +| `runtime.write_rt_size` | Integer | `8` | The number of threads to execute the runtime for global write operations. | +| `runtime.bg_rt_size` | Integer | `8` | The number of threads to execute the runtime for global background operations. | | `heartbeat` | -- | -- | The heartbeat options. | | `heartbeat.interval` | String | `18s` | Interval for sending heartbeat messages to the metasrv. | | `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. | @@ -240,6 +248,10 @@ | `use_memory_store` | Bool | `false` | Store data in memory. | | `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. | | `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. | +| `runtime` | -- | -- | The runtime options. | +| `runtime.read_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | +| `runtime.write_rt_size` | Integer | `8` | The number of threads to execute the runtime for global write operations. | +| `runtime.bg_rt_size` | Integer | `8` | The number of threads to execute the runtime for global background operations. | | `procedure` | -- | -- | Procedure storage options. | | `procedure.max_retry_times` | Integer | `12` | Procedure max retry time. | | `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially | @@ -300,6 +312,10 @@ | `rpc_max_recv_message_size` | String | `512MB` | The maximum receive message size for gRPC server. | | `rpc_max_send_message_size` | String | `512MB` | The maximum send message size for gRPC server. | | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. | +| `runtime` | -- | -- | The runtime options. | +| `runtime.read_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | +| `runtime.write_rt_size` | Integer | `8` | The number of threads to execute the runtime for global write operations. | +| `runtime.bg_rt_size` | Integer | `8` | The number of threads to execute the runtime for global background operations. | | `heartbeat` | -- | -- | The heartbeat options. | | `heartbeat.interval` | String | `3s` | Interval for sending heartbeat messages to the metasrv. | | `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index d1849048778c..3a20d3ac5f16 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -32,6 +32,15 @@ rpc_max_send_message_size = "512MB" ## Enable telemetry to collect anonymous usage data. enable_telemetry = true +## The runtime options. 
+[runtime] +## The number of threads to execute the runtime for global read operations. +read_rt_size = 8 +## The number of threads to execute the runtime for global write operations. +write_rt_size = 8 +## The number of threads to execute the runtime for global background operations. +bg_rt_size = 8 + ## The heartbeat options. [heartbeat] ## Interval for sending heartbeat messages to the metasrv. diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 728a3099f837..4f4bd5bf3d3d 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -5,6 +5,15 @@ mode = "standalone" ## +toml2docs:none-default default_timezone = "UTC" +## The runtime options. +[runtime] +## The number of threads to execute the runtime for global read operations. +read_rt_size = 8 +## The number of threads to execute the runtime for global write operations. +write_rt_size = 8 +## The number of threads to execute the runtime for global background operations. +bg_rt_size = 8 + ## The heartbeat options. [heartbeat] ## Interval for sending heartbeat messages to the metasrv. diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index bc6a5d119342..239533bd5886 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -25,6 +25,15 @@ enable_telemetry = true ## If it's not empty, the metasrv will store all data with this key prefix. store_key_prefix = "" +## The runtime options. +[runtime] +## The number of threads to execute the runtime for global read operations. +read_rt_size = 8 +## The number of threads to execute the runtime for global write operations. +write_rt_size = 8 +## The number of threads to execute the runtime for global background operations. +bg_rt_size = 8 + ## Procedure storage options. [procedure] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 8386c7e1e61a..d6fcc3e8943e 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -8,6 +8,15 @@ enable_telemetry = true ## +toml2docs:none-default default_timezone = "UTC" +## The runtime options. +[runtime] +## The number of threads to execute the runtime for global read operations. +read_rt_size = 8 +## The number of threads to execute the runtime for global write operations. +write_rt_size = 8 +## The number of threads to execute the runtime for global background operations. +bg_rt_size = 8 + ## The HTTP server options. [http] ## The address to bind the HTTP server. 
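For reference, the `[runtime]` keys added to the example configs above are consumed once at process start-up. The following is only a minimal sketch of that flow, assuming the `GreptimeOptions` wrapper, `RuntimeOptions`, and `init_global_runtimes` signature introduced later in this series; the `main` wrapper and the config path are illustrative, and the empty env prefix mirrors the new `load_config_test.rs` below.

use cmd::options::GreptimeOptions;
use cmd::standalone::StandaloneOptions;
use common_config::Configurable;

fn main() {
    // Layered load: config file > environment variables > default values,
    // including the `[runtime]` section shown above.
    let opts = GreptimeOptions::<StandaloneOptions>::load_layered_options(
        Some("config/standalone.example.toml"),
        "", // empty env prefix, as in the config loading tests below
    )
    .unwrap();

    // Size the global read/write/background runtimes before any task is spawned.
    common_runtime::init_global_runtimes(&opts.runtime);

    // The component itself is then built from `opts.component`,
    // as `StartCommand::build` does further down in this patch.
}
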
diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 3c189f2c3d07..d8680ed5294e 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -23,7 +23,6 @@ use common_telemetry::info; use common_telemetry::logging::TracingOptions; use common_version::{short_version, version}; use common_wal::config::DatanodeWalConfig; -use datanode::config::DatanodeOptions; use datanode::datanode::{Datanode, DatanodeBuilder}; use datanode::service::DatanodeServiceBuilder; use meta_client::MetaClientOptions; @@ -34,11 +33,13 @@ use tracing_appender::non_blocking::WorkerGuard; use crate::error::{ LoadLayeredConfigSnafu, MissingConfigSnafu, Result, ShutdownDatanodeSnafu, StartDatanodeSnafu, }; -use crate::options::GlobalOptions; +use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{log_versions, App}; pub const APP_NAME: &str = "greptime-datanode"; +type DatanodeOptions = GreptimeOptions; + pub struct Instance { datanode: Datanode, @@ -97,7 +98,9 @@ impl Command { } pub fn load_options(&self, global_options: &GlobalOptions) -> Result { - self.subcmd.load_options(global_options) + match &self.subcmd { + SubCommand::Start(cmd) => cmd.load_options(global_options), + } } } @@ -112,12 +115,6 @@ impl SubCommand { SubCommand::Start(cmd) => cmd.build(opts).await, } } - - fn load_options(&self, global_options: &GlobalOptions) -> Result { - match self { - SubCommand::Start(cmd) => cmd.load_options(global_options), - } - } } #[derive(Debug, Parser, Default)] @@ -146,22 +143,25 @@ struct StartCommand { impl StartCommand { fn load_options(&self, global_options: &GlobalOptions) -> Result { - self.merge_with_cli_options( - global_options, - DatanodeOptions::load_layered_options( - self.config_file.as_deref(), - self.env_prefix.as_ref(), - ) - .context(LoadLayeredConfigSnafu)?, + let mut opts = DatanodeOptions::load_layered_options( + self.config_file.as_deref(), + self.env_prefix.as_ref(), ) + .context(LoadLayeredConfigSnafu)?; + + self.merge_with_cli_options(global_options, &mut opts)?; + + Ok(opts) } // The precedence order is: cli > config file > environment variables > default values. fn merge_with_cli_options( &self, global_options: &GlobalOptions, - mut opts: DatanodeOptions, - ) -> Result { + opts: &mut DatanodeOptions, + ) -> Result<()> { + let opts = &mut opts.component; + if let Some(dir) = &global_options.log_dir { opts.logging.dir.clone_from(dir); } @@ -231,25 +231,28 @@ impl StartCommand { // Disable dashboard in datanode. 
opts.http.disable_dashboard = true; - Ok(opts) + Ok(()) } - async fn build(&self, mut opts: DatanodeOptions) -> Result { + async fn build(&self, opts: DatanodeOptions) -> Result { + common_runtime::init_global_runtimes(&opts.runtime); + let guard = common_telemetry::init_global_logging( APP_NAME, - &opts.logging, - &opts.tracing, - opts.node_id.map(|x| x.to_string()), + &opts.component.logging, + &opts.component.tracing, + opts.component.node_id.map(|x| x.to_string()), ); log_versions(version!(), short_version!()); + info!("Datanode start command: {:#?}", self); + info!("Datanode options: {:#?}", opts); + + let mut opts = opts.component; let plugins = plugins::setup_datanode_plugins(&mut opts) .await .context(StartDatanodeSnafu)?; - info!("Datanode start command: {:#?}", self); - info!("Datanode options: {:#?}", opts); - let node_id = opts .node_id .context(MissingConfigSnafu { msg: "'node_id'" })?; @@ -353,7 +356,7 @@ mod tests { ..Default::default() }; - let options = cmd.load_options(&GlobalOptions::default()).unwrap(); + let options = cmd.load_options(&Default::default()).unwrap().component; assert_eq!("127.0.0.1:3001".to_string(), options.rpc_addr); assert_eq!(Some(42), options.node_id); @@ -414,7 +417,8 @@ mod tests { fn test_try_from_cmd() { let opt = StartCommand::default() .load_options(&GlobalOptions::default()) - .unwrap(); + .unwrap() + .component; assert_eq!(Mode::Standalone, opt.mode); let opt = (StartCommand { @@ -423,7 +427,8 @@ mod tests { ..Default::default() }) .load_options(&GlobalOptions::default()) - .unwrap(); + .unwrap() + .component; assert_eq!(Mode::Distributed, opt.mode); assert!((StartCommand { @@ -454,7 +459,8 @@ mod tests { #[cfg(feature = "tokio-console")] tokio_console_addr: None, }) - .unwrap(); + .unwrap() + .component; let logging_opt = options.logging; assert_eq!("/tmp/greptimedb/test/logs", logging_opt.dir); @@ -536,7 +542,7 @@ mod tests { ..Default::default() }; - let opts = command.load_options(&GlobalOptions::default()).unwrap(); + let opts = command.load_options(&Default::default()).unwrap().component; // Should be read from env, env > default values. let DatanodeWalConfig::RaftEngine(raft_engine_config) = opts.wal else { @@ -562,7 +568,10 @@ mod tests { assert_eq!(raft_engine_config.dir.unwrap(), "/other/wal/dir"); // Should be default value. 
- assert_eq!(opts.http.addr, DatanodeOptions::default().http.addr); + assert_eq!( + opts.http.addr, + DatanodeOptions::default().component.http.addr + ); }, ); } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index a3e744e9c7ec..a7781e37a2ed 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -29,7 +29,6 @@ use common_telemetry::info; use common_telemetry::logging::TracingOptions; use common_time::timezone::set_default_timezone; use common_version::{short_version, version}; -use frontend::frontend::FrontendOptions; use frontend::heartbeat::handler::invalidate_table_cache::InvalidateTableCacheHandler; use frontend::heartbeat::HeartbeatTask; use frontend::instance::builder::FrontendBuilder; @@ -44,9 +43,11 @@ use tracing_appender::non_blocking::WorkerGuard; use crate::error::{ self, InitTimezoneSnafu, LoadLayeredConfigSnafu, MissingConfigSnafu, Result, StartFrontendSnafu, }; -use crate::options::GlobalOptions; +use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{log_versions, App}; +type FrontendOptions = GreptimeOptions; + pub struct Instance { frontend: FeInstance, @@ -164,22 +165,25 @@ pub struct StartCommand { impl StartCommand { fn load_options(&self, global_options: &GlobalOptions) -> Result { - self.merge_with_cli_options( - global_options, - FrontendOptions::load_layered_options( - self.config_file.as_deref(), - self.env_prefix.as_ref(), - ) - .context(LoadLayeredConfigSnafu)?, + let mut opts = FrontendOptions::load_layered_options( + self.config_file.as_deref(), + self.env_prefix.as_ref(), ) + .context(LoadLayeredConfigSnafu)?; + + self.merge_with_cli_options(global_options, &mut opts)?; + + Ok(opts) } // The precedence order is: cli > config file > environment variables > default values. fn merge_with_cli_options( &self, global_options: &GlobalOptions, - mut opts: FrontendOptions, - ) -> Result { + opts: &mut FrontendOptions, + ) -> Result<()> { + let opts = &mut opts.component; + if let Some(dir) = &global_options.log_dir { opts.logging.dir.clone_from(dir); } @@ -242,26 +246,29 @@ impl StartCommand { opts.user_provider.clone_from(&self.user_provider); - Ok(opts) + Ok(()) } - async fn build(&self, mut opts: FrontendOptions) -> Result { + async fn build(&self, opts: FrontendOptions) -> Result { + common_runtime::init_global_runtimes(&opts.runtime); + let guard = common_telemetry::init_global_logging( APP_NAME, - &opts.logging, - &opts.tracing, - opts.node_id.clone(), + &opts.component.logging, + &opts.component.tracing, + opts.component.node_id.clone(), ); log_versions(version!(), short_version!()); + info!("Frontend start command: {:#?}", self); + info!("Frontend options: {:#?}", opts); + + let mut opts = opts.component; #[allow(clippy::unnecessary_mut_passed)] let plugins = plugins::setup_frontend_plugins(&mut opts) .await .context(StartFrontendSnafu)?; - info!("Frontend start command: {:#?}", self); - info!("Frontend options: {:#?}", opts); - set_default_timezone(opts.default_timezone.as_deref()).context(InitTimezoneSnafu)?; let meta_client_options = opts.meta_client.as_ref().context(MissingConfigSnafu { @@ -380,14 +387,14 @@ mod tests { ..Default::default() }; - let opts = command.load_options(&GlobalOptions::default()).unwrap(); + let opts = command.load_options(&Default::default()).unwrap().component; assert_eq!(opts.http.addr, "127.0.0.1:1234"); assert_eq!(ReadableSize::mb(64), opts.http.body_limit); assert_eq!(opts.mysql.addr, "127.0.0.1:5678"); assert_eq!(opts.postgres.addr, "127.0.0.1:5432"); - let default_opts = 
FrontendOptions::default(); + let default_opts = FrontendOptions::default().component; assert_eq!(opts.grpc.addr, default_opts.grpc.addr); assert!(opts.mysql.enable); @@ -428,7 +435,8 @@ mod tests { ..Default::default() }; - let fe_opts = command.load_options(&GlobalOptions::default()).unwrap(); + let fe_opts = command.load_options(&Default::default()).unwrap().component; + assert_eq!(Mode::Distributed, fe_opts.mode); assert_eq!("127.0.0.1:4000".to_string(), fe_opts.http.addr); assert_eq!(Duration::from_secs(30), fe_opts.http.timeout); @@ -442,7 +450,7 @@ mod tests { #[tokio::test] async fn test_try_from_start_command_to_anymap() { - let mut fe_opts = FrontendOptions { + let mut fe_opts = frontend::frontend::FrontendOptions { http: HttpOptions { disable_dashboard: false, ..Default::default() @@ -479,7 +487,8 @@ mod tests { #[cfg(feature = "tokio-console")] tokio_console_addr: None, }) - .unwrap(); + .unwrap() + .component; let logging_opt = options.logging; assert_eq!("/tmp/greptimedb/test/logs", logging_opt.dir); @@ -557,7 +566,7 @@ mod tests { ..Default::default() }; - let fe_opts = command.load_options(&GlobalOptions::default()).unwrap(); + let fe_opts = command.load_options(&Default::default()).unwrap().component; // Should be read from env, env > default values. assert_eq!(fe_opts.mysql.runtime_size, 11); diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index 8648f220f3ac..3b89fdce112e 100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -21,14 +21,15 @@ use common_telemetry::info; use common_telemetry::logging::TracingOptions; use common_version::{short_version, version}; use meta_srv::bootstrap::MetasrvInstance; -use meta_srv::metasrv::MetasrvOptions; use snafu::ResultExt; use tracing_appender::non_blocking::WorkerGuard; use crate::error::{self, LoadLayeredConfigSnafu, Result, StartMetaServerSnafu}; -use crate::options::GlobalOptions; +use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{log_versions, App}; +type MetasrvOptions = GreptimeOptions; + pub const APP_NAME: &str = "greptime-metasrv"; pub struct Instance { @@ -139,22 +140,25 @@ struct StartCommand { impl StartCommand { fn load_options(&self, global_options: &GlobalOptions) -> Result { - self.merge_with_cli_options( - global_options, - MetasrvOptions::load_layered_options( - self.config_file.as_deref(), - self.env_prefix.as_ref(), - ) - .context(LoadLayeredConfigSnafu)?, + let mut opts = MetasrvOptions::load_layered_options( + self.config_file.as_deref(), + self.env_prefix.as_ref(), ) + .context(LoadLayeredConfigSnafu)?; + + self.merge_with_cli_options(global_options, &mut opts)?; + + Ok(opts) } // The precedence order is: cli > config file > environment variables > default values. fn merge_with_cli_options( &self, global_options: &GlobalOptions, - mut opts: MetasrvOptions, - ) -> Result { + opts: &mut MetasrvOptions, + ) -> Result<()> { + let opts = &mut opts.component; + if let Some(dir) = &global_options.log_dir { opts.logging.dir.clone_from(dir); } @@ -217,21 +221,28 @@ impl StartCommand { // Disable dashboard in metasrv. 
opts.http.disable_dashboard = true; - Ok(opts) + Ok(()) } - async fn build(&self, mut opts: MetasrvOptions) -> Result { - let guard = - common_telemetry::init_global_logging(APP_NAME, &opts.logging, &opts.tracing, None); + async fn build(&self, opts: MetasrvOptions) -> Result { + common_runtime::init_global_runtimes(&opts.runtime); + + let guard = common_telemetry::init_global_logging( + APP_NAME, + &opts.component.logging, + &opts.component.tracing, + None, + ); log_versions(version!(), short_version!()); + info!("Metasrv start command: {:#?}", self); + info!("Metasrv options: {:#?}", opts); + + let mut opts = opts.component; let plugins = plugins::setup_metasrv_plugins(&mut opts) .await .context(StartMetaServerSnafu)?; - info!("Metasrv start command: {:#?}", self); - info!("Metasrv options: {:#?}", opts); - let builder = meta_srv::bootstrap::metasrv_builder(&opts, plugins.clone(), None) .await .context(error::BuildMetaServerSnafu)?; @@ -266,7 +277,7 @@ mod tests { ..Default::default() }; - let options = cmd.load_options(&GlobalOptions::default()).unwrap(); + let options = cmd.load_options(&Default::default()).unwrap().component; assert_eq!("127.0.0.1:3002".to_string(), options.bind_addr); assert_eq!(vec!["127.0.0.1:2380".to_string()], options.store_addrs); assert_eq!(SelectorType::LoadBased, options.selector); @@ -299,7 +310,7 @@ mod tests { ..Default::default() }; - let options = cmd.load_options(&GlobalOptions::default()).unwrap(); + let options = cmd.load_options(&Default::default()).unwrap().component; assert_eq!("127.0.0.1:3002".to_string(), options.bind_addr); assert_eq!("127.0.0.1:3002".to_string(), options.server_addr); assert_eq!(vec!["127.0.0.1:2379".to_string()], options.store_addrs); @@ -349,7 +360,8 @@ mod tests { #[cfg(feature = "tokio-console")] tokio_console_addr: None, }) - .unwrap(); + .unwrap() + .component; let logging_opt = options.logging; assert_eq!("/tmp/greptimedb/test/logs", logging_opt.dir); @@ -406,7 +418,7 @@ mod tests { ..Default::default() }; - let opts = command.load_options(&GlobalOptions::default()).unwrap(); + let opts = command.load_options(&Default::default()).unwrap().component; // Should be read from env, env > default values. assert_eq!(opts.bind_addr, "127.0.0.1:14002"); diff --git a/src/cmd/src/options.rs b/src/cmd/src/options.rs index 03ccbc536247..26ac9203a225 100644 --- a/src/cmd/src/options.rs +++ b/src/cmd/src/options.rs @@ -13,6 +13,9 @@ // limitations under the License. use clap::Parser; +use common_config::Configurable; +use common_runtime::global::RuntimeOptions; +use serde::{Deserialize, Serialize}; #[derive(Parser, Default, Debug, Clone)] pub struct GlobalOptions { @@ -29,3 +32,22 @@ pub struct GlobalOptions { #[arg(global = true)] pub tokio_console_addr: Option, } + +// TODO(LFC): Move logging and tracing options into global options, like the runtime options. +/// All the options of GreptimeDB. +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +#[serde(default)] +pub struct GreptimeOptions { + /// The runtime options. + pub runtime: RuntimeOptions, + + /// The options of each component (like Datanode or Standalone) of GreptimeDB. 
+ #[serde(flatten)] + pub component: T, +} + +impl Configurable for GreptimeOptions { + fn env_list_keys() -> Option<&'static [&'static str]> { + T::env_list_keys() + } +} diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 90958baf1048..e1ac35c98b06 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -67,7 +67,7 @@ use crate::error::{ ShutdownFrontendSnafu, StartDatanodeSnafu, StartFrontendSnafu, StartProcedureManagerSnafu, StartWalOptionsAllocatorSnafu, StopProcedureManagerSnafu, }; -use crate::options::GlobalOptions; +use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{log_versions, App}; pub const APP_NAME: &str = "greptime-standalone"; @@ -79,11 +79,14 @@ pub struct Command { } impl Command { - pub async fn build(&self, opts: StandaloneOptions) -> Result { + pub async fn build(&self, opts: GreptimeOptions) -> Result { self.subcmd.build(opts).await } - pub fn load_options(&self, global_options: &GlobalOptions) -> Result { + pub fn load_options( + &self, + global_options: &GlobalOptions, + ) -> Result> { self.subcmd.load_options(global_options) } } @@ -94,20 +97,23 @@ enum SubCommand { } impl SubCommand { - async fn build(&self, opts: StandaloneOptions) -> Result { + async fn build(&self, opts: GreptimeOptions) -> Result { match self { SubCommand::Start(cmd) => cmd.build(opts).await, } } - fn load_options(&self, global_options: &GlobalOptions) -> Result { + fn load_options( + &self, + global_options: &GlobalOptions, + ) -> Result> { match self { SubCommand::Start(cmd) => cmd.load_options(global_options), } } } -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(default)] pub struct StandaloneOptions { pub mode: Mode, @@ -161,7 +167,7 @@ impl Default for StandaloneOptions { } } -impl Configurable<'_> for StandaloneOptions { +impl Configurable for StandaloneOptions { fn env_list_keys() -> Option<&'static [&'static str]> { Some(&["wal.broker_endpoints"]) } @@ -291,23 +297,27 @@ pub struct StartCommand { } impl StartCommand { - fn load_options(&self, global_options: &GlobalOptions) -> Result { - self.merge_with_cli_options( - global_options, - StandaloneOptions::load_layered_options( - self.config_file.as_deref(), - self.env_prefix.as_ref(), - ) - .context(LoadLayeredConfigSnafu)?, + fn load_options( + &self, + global_options: &GlobalOptions, + ) -> Result> { + let mut opts = GreptimeOptions::::load_layered_options( + self.config_file.as_deref(), + self.env_prefix.as_ref(), ) + .context(LoadLayeredConfigSnafu)?; + + self.merge_with_cli_options(global_options, &mut opts.component)?; + + Ok(opts) } // The precedence order is: cli > config file > environment variables > default values. pub fn merge_with_cli_options( &self, global_options: &GlobalOptions, - mut opts: StandaloneOptions, - ) -> Result { + opts: &mut StandaloneOptions, + ) -> Result<()> { // Should always be standalone mode. 
opts.mode = Mode::Standalone; @@ -369,20 +379,27 @@ impl StartCommand { opts.user_provider.clone_from(&self.user_provider); - Ok(opts) + Ok(()) } #[allow(unreachable_code)] #[allow(unused_variables)] #[allow(clippy::diverging_sub_expression)] - async fn build(&self, opts: StandaloneOptions) -> Result { - let guard = - common_telemetry::init_global_logging(APP_NAME, &opts.logging, &opts.tracing, None); + async fn build(&self, opts: GreptimeOptions) -> Result { + common_runtime::init_global_runtimes(&opts.runtime); + + let guard = common_telemetry::init_global_logging( + APP_NAME, + &opts.component.logging, + &opts.component.tracing, + None, + ); log_versions(version!(), short_version!()); info!("Standalone start command: {:#?}", self); - info!("Building standalone instance with {opts:#?}"); + info!("Standalone options: {opts:#?}"); + let opts = opts.component; let mut fe_opts = opts.frontend_options(); #[allow(clippy::unnecessary_mut_passed)] let fe_plugins = plugins::setup_frontend_plugins(&mut fe_opts) // mut ref is MUST, DO NOT change it @@ -664,7 +681,10 @@ mod tests { ..Default::default() }; - let options = cmd.load_options(&GlobalOptions::default()).unwrap(); + let options = cmd + .load_options(&GlobalOptions::default()) + .unwrap() + .component; let fe_opts = options.frontend_options(); let dn_opts = options.datanode_options(); let logging_opts = options.logging; @@ -725,7 +745,8 @@ mod tests { #[cfg(feature = "tokio-console")] tokio_console_addr: None, }) - .unwrap(); + .unwrap() + .component; assert_eq!("/tmp/greptimedb/test/logs", opts.logging.dir); assert_eq!("debug", opts.logging.level.unwrap()); @@ -787,7 +808,7 @@ mod tests { ..Default::default() }; - let opts = command.load_options(&GlobalOptions::default()).unwrap(); + let opts = command.load_options(&Default::default()).unwrap().component; // Should be read from env, env > default values. assert_eq!(opts.logging.dir, "/other/log/dir"); diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs new file mode 100644 index 000000000000..80075b846e51 --- /dev/null +++ b/src/cmd/tests/load_config_test.rs @@ -0,0 +1,218 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::time::Duration; + +use cmd::options::GreptimeOptions; +use cmd::standalone::StandaloneOptions; +use common_config::Configurable; +use common_runtime::global::RuntimeOptions; +use common_telemetry::logging::LoggingOptions; +use common_wal::config::raft_engine::RaftEngineConfig; +use common_wal::config::{DatanodeWalConfig, StandaloneWalConfig}; +use datanode::config::{DatanodeOptions, RegionEngineConfig, StorageConfig}; +use frontend::frontend::FrontendOptions; +use frontend::service_config::datanode::DatanodeClientOptions; +use meta_client::MetaClientOptions; +use meta_srv::metasrv::MetasrvOptions; +use meta_srv::selector::SelectorType; +use mito2::config::MitoConfig; +use servers::export_metrics::ExportMetricsOption; + +#[test] +fn test_load_datanode_example_config() { + let example_config = common_test_util::find_workspace_path("config/datanode.example.toml"); + let options = + GreptimeOptions::::load_layered_options(example_config.to_str(), "") + .unwrap(); + + let expected = GreptimeOptions:: { + runtime: RuntimeOptions { + read_rt_size: 8, + write_rt_size: 8, + bg_rt_size: 8, + }, + component: DatanodeOptions { + node_id: Some(42), + rpc_hostname: Some("127.0.0.1".to_string()), + meta_client: Some(MetaClientOptions { + metasrv_addrs: vec!["127.0.0.1:3002".to_string()], + timeout: Duration::from_secs(3), + heartbeat_timeout: Duration::from_millis(500), + ddl_timeout: Duration::from_secs(10), + connect_timeout: Duration::from_secs(1), + tcp_nodelay: true, + metadata_cache_max_capacity: 100000, + metadata_cache_ttl: Duration::from_secs(600), + metadata_cache_tti: Duration::from_secs(300), + }), + wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig { + dir: Some("/tmp/greptimedb/wal".to_string()), + sync_period: Some(Duration::from_secs(10)), + ..Default::default() + }), + storage: StorageConfig { + data_home: "/tmp/greptimedb/".to_string(), + ..Default::default() + }, + region_engine: vec![RegionEngineConfig::Mito(MitoConfig { + num_workers: 8, + auto_flush_interval: Duration::from_secs(3600), + scan_parallelism: 0, + ..Default::default() + })], + logging: LoggingOptions { + level: Some("info".to_string()), + otlp_endpoint: Some("".to_string()), + tracing_sample_ratio: Some(Default::default()), + ..Default::default() + }, + export_metrics: ExportMetricsOption { + self_import: Some(Default::default()), + remote_write: Some(Default::default()), + ..Default::default() + }, + ..Default::default() + }, + }; + + assert_eq!(options, expected); +} + +#[test] +fn test_load_frontend_example_config() { + let example_config = common_test_util::find_workspace_path("config/frontend.example.toml"); + let options = + GreptimeOptions::::load_layered_options(example_config.to_str(), "") + .unwrap(); + let expected = GreptimeOptions:: { + runtime: RuntimeOptions { + read_rt_size: 8, + write_rt_size: 8, + bg_rt_size: 8, + }, + component: FrontendOptions { + default_timezone: Some("UTC".to_string()), + meta_client: Some(MetaClientOptions { + metasrv_addrs: vec!["127.0.0.1:3002".to_string()], + timeout: Duration::from_secs(3), + heartbeat_timeout: Duration::from_millis(500), + ddl_timeout: Duration::from_secs(10), + connect_timeout: Duration::from_secs(1), + tcp_nodelay: true, + metadata_cache_max_capacity: 100000, + metadata_cache_ttl: Duration::from_secs(600), + metadata_cache_tti: Duration::from_secs(300), + }), + logging: LoggingOptions { + level: Some("info".to_string()), + otlp_endpoint: Some("".to_string()), + tracing_sample_ratio: Some(Default::default()), + ..Default::default() + }, + 
datanode: frontend::service_config::DatanodeOptions { + client: DatanodeClientOptions { + connect_timeout: Duration::from_secs(10), + tcp_nodelay: true, + }, + }, + export_metrics: ExportMetricsOption { + self_import: Some(Default::default()), + remote_write: Some(Default::default()), + ..Default::default() + }, + ..Default::default() + }, + }; + assert_eq!(options, expected); +} + +#[test] +fn test_load_metasrv_example_config() { + let example_config = common_test_util::find_workspace_path("config/metasrv.example.toml"); + let options = + GreptimeOptions::::load_layered_options(example_config.to_str(), "") + .unwrap(); + let expected = GreptimeOptions:: { + runtime: RuntimeOptions { + read_rt_size: 8, + write_rt_size: 8, + bg_rt_size: 8, + }, + component: MetasrvOptions { + selector: SelectorType::LeaseBased, + data_home: "/tmp/metasrv/".to_string(), + logging: LoggingOptions { + dir: "/tmp/greptimedb/logs".to_string(), + level: Some("info".to_string()), + otlp_endpoint: Some("".to_string()), + tracing_sample_ratio: Some(Default::default()), + ..Default::default() + }, + export_metrics: ExportMetricsOption { + self_import: Some(Default::default()), + remote_write: Some(Default::default()), + ..Default::default() + }, + ..Default::default() + }, + }; + assert_eq!(options, expected); +} + +#[test] +fn test_load_standalone_example_config() { + let example_config = common_test_util::find_workspace_path("config/standalone.example.toml"); + let options = + GreptimeOptions::::load_layered_options(example_config.to_str(), "") + .unwrap(); + let expected = GreptimeOptions:: { + runtime: RuntimeOptions { + read_rt_size: 8, + write_rt_size: 8, + bg_rt_size: 8, + }, + component: StandaloneOptions { + default_timezone: Some("UTC".to_string()), + wal: StandaloneWalConfig::RaftEngine(RaftEngineConfig { + dir: Some("/tmp/greptimedb/wal".to_string()), + sync_period: Some(Duration::from_secs(10)), + ..Default::default() + }), + region_engine: vec![RegionEngineConfig::Mito(MitoConfig { + num_workers: 8, + auto_flush_interval: Duration::from_secs(3600), + scan_parallelism: 0, + ..Default::default() + })], + storage: StorageConfig { + data_home: "/tmp/greptimedb/".to_string(), + ..Default::default() + }, + logging: LoggingOptions { + level: Some("info".to_string()), + otlp_endpoint: Some("".to_string()), + tracing_sample_ratio: Some(Default::default()), + ..Default::default() + }, + export_metrics: ExportMetricsOption { + self_import: Some(Default::default()), + remote_write: Some(Default::default()), + ..Default::default() + }, + ..Default::default() + }, + }; + assert_eq!(options, expected); +} diff --git a/src/common/config/src/config.rs b/src/common/config/src/config.rs index c21735a059ea..e0816fbd5671 100644 --- a/src/common/config/src/config.rs +++ b/src/common/config/src/config.rs @@ -13,7 +13,8 @@ // limitations under the License. use config::{Environment, File, FileFormat}; -use serde::{Deserialize, Serialize}; +use serde::de::DeserializeOwned; +use serde::Serialize; use snafu::ResultExt; use crate::error::{LoadLayeredConfigSnafu, Result, SerdeJsonSnafu, TomlFormatSnafu}; @@ -25,7 +26,7 @@ pub const ENV_VAR_SEP: &str = "__"; pub const ENV_LIST_SEP: &str = ","; /// Configuration trait defines the common interface for configuration that can be loaded from multiple sources and serialized to TOML. 
-pub trait Configurable<'de>: Serialize + Deserialize<'de> + Default + Sized { +pub trait Configurable: Serialize + DeserializeOwned + Default + Sized { /// Load the configuration from multiple sources and merge them. /// The precedence order is: config file > environment variables > default values. /// `env_prefix` is the prefix of environment variables, e.g. "FRONTEND__xxx". @@ -128,7 +129,7 @@ mod tests { } } - impl Configurable<'_> for TestDatanodeConfig { + impl Configurable for TestDatanodeConfig { fn env_list_keys() -> Option<&'static [&'static str]> { Some(&["meta_client.metasrv_addrs"]) } diff --git a/src/common/runtime/Cargo.toml b/src/common/runtime/Cargo.toml index a6da1f571fc2..e5fa276c4bf1 100644 --- a/src/common/runtime/Cargo.toml +++ b/src/common/runtime/Cargo.toml @@ -13,13 +13,15 @@ common-error.workspace = true common-macro.workspace = true common-telemetry.workspace = true lazy_static.workspace = true +num_cpus.workspace = true once_cell.workspace = true paste.workspace = true prometheus.workspace = true +serde.workspace = true snafu.workspace = true tokio.workspace = true tokio-metrics = "0.3" -tokio-metrics-collector = "0.2" +tokio-metrics-collector = { git = "https://github.com/MichaelScofield/tokio-metrics-collector.git", rev = "89d692d5753d28564a7aac73c6ac5aba22243ba0" } tokio-util.workspace = true [dev-dependencies] diff --git a/src/common/runtime/src/global.rs b/src/common/runtime/src/global.rs index 51bad13107c7..6b21851e1680 100644 --- a/src/common/runtime/src/global.rs +++ b/src/common/runtime/src/global.rs @@ -19,6 +19,7 @@ use std::sync::{Mutex, Once}; use common_telemetry::info; use once_cell::sync::Lazy; use paste::paste; +use serde::{Deserialize, Serialize}; use crate::{Builder, JoinHandle, Runtime}; @@ -26,6 +27,28 @@ const READ_WORKERS: usize = 8; const WRITE_WORKERS: usize = 8; const BG_WORKERS: usize = 8; +/// The options for the global runtimes. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct RuntimeOptions { + /// The number of threads to execute the runtime for global read operations. + pub read_rt_size: usize, + /// The number of threads to execute the runtime for global write operations. + pub write_rt_size: usize, + /// The number of threads to execute the runtime for global background operations. + pub bg_rt_size: usize, +} + +impl Default for RuntimeOptions { + fn default() -> Self { + let cpus = num_cpus::get(); + Self { + read_rt_size: cpus, + write_rt_size: cpus, + bg_rt_size: cpus, + } + } +} + pub fn create_runtime(runtime_name: &str, thread_name: &str, worker_threads: usize) -> Runtime { info!("Creating runtime with runtime_name: {runtime_name}, thread_name: {thread_name}, work_threads: {worker_threads}."); Builder::default() @@ -112,18 +135,26 @@ static CONFIG_RUNTIMES: Lazy> = /// # Panics /// Panics when the global runtimes are already initialized. /// You should call this function before using any runtime functions. 
-pub fn init_global_runtimes(
-    read: Option<Runtime>,
-    write: Option<Runtime>,
-    background: Option<Runtime>,
-) {
+pub fn init_global_runtimes(options: &RuntimeOptions) {
     static START: Once = Once::new();
     START.call_once(move || {
        let mut c = CONFIG_RUNTIMES.lock().unwrap();
         assert!(!c.already_init, "Global runtimes already initialized");
-        c.read_runtime = read;
-        c.write_runtime = write;
-        c.bg_runtime = background;
+        c.read_runtime = Some(create_runtime(
+            "global-read",
+            "global-read-worker",
+            options.read_rt_size,
+        ));
+        c.write_runtime = Some(create_runtime(
+            "global-write",
+            "global-write-worker",
+            options.write_rt_size,
+        ));
+        c.bg_runtime = Some(create_runtime(
+            "global-bg",
+            "global-bg-worker",
+            options.bg_rt_size,
+        ));
     });
 }
 
diff --git a/src/common/runtime/src/lib.rs b/src/common/runtime/src/lib.rs
index 08baed46cbd3..ba6f74c96cc6 100644
--- a/src/common/runtime/src/lib.rs
+++ b/src/common/runtime/src/lib.rs
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 pub mod error;
-mod global;
+pub mod global;
 mod metrics;
 mod repeated_task;
 pub mod runtime;
diff --git a/src/common/test-util/Cargo.toml b/src/common/test-util/Cargo.toml
index 2b66dd45ce3a..b8084a2a8b3e 100644
--- a/src/common/test-util/Cargo.toml
+++ b/src/common/test-util/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 workspace = true
 
 [dependencies]
-client.workspace = true
+client = { workspace = true, features = ["testing"] }
 common-query.workspace = true
 common-recordbatch.workspace = true
 once_cell.workspace = true
diff --git a/src/common/test-util/src/recordbatch.rs b/src/common/test-util/src/recordbatch.rs
index 47c949d40715..eb666e167a31 100644
--- a/src/common/test-util/src/recordbatch.rs
+++ b/src/common/test-util/src/recordbatch.rs
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use client::Database;
 use common_query::OutputData;
 use common_recordbatch::util;
 
@@ -29,3 +30,25 @@ pub async fn check_output_stream(output: OutputData, expected: &str) {
     let pretty_print = recordbatches.pretty_print().unwrap();
     assert_eq!(pretty_print, expected, "actual: \n{}", pretty_print);
 }
+
+pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {
+    let output = db.sql(sql).await.unwrap();
+    let output = output.data;
+
+    match (&output, expected) {
+        (OutputData::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
+            assert_eq!(
+                *x, y,
+                r#"
+expected: {y}
+actual: {x}
+"#
+            )
+        }
+        (OutputData::RecordBatches(_), ExpectedOutput::QueryResult(x))
+        | (OutputData::Stream(_), ExpectedOutput::QueryResult(x)) => {
+            check_output_stream(output, x).await
+        }
+        _ => panic!(),
+    }
+}
diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs
index ec278d3c4247..7e76c7d68169 100644
--- a/src/datanode/src/config.rs
+++ b/src/datanode/src/config.rs
@@ -15,7 +15,7 @@
 //! 
Datanode configurations use common_base::readable_size::ReadableSize; -use common_base::secrets::SecretString; +use common_base::secrets::{ExposeSecret, SecretString}; use common_config::Configurable; use common_grpc::channel_manager::{ DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE, DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE, @@ -38,7 +38,7 @@ pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::mb(256); const DEFAULT_DATA_HOME: &str = "/tmp/greptimedb"; /// Object storage config -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(tag = "type")] pub enum ObjectStoreConfig { File(FileConfig), @@ -61,7 +61,7 @@ impl ObjectStoreConfig { } /// Storage engine config -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(default)] pub struct StorageConfig { /// The working directory of database @@ -85,7 +85,7 @@ impl Default for StorageConfig { #[serde(default)] pub struct FileConfig {} -#[derive(Debug, Clone, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)] #[serde(default)] pub struct ObjectStorageCacheConfig { /// The local file cache directory @@ -109,6 +109,18 @@ pub struct S3Config { pub cache: ObjectStorageCacheConfig, } +impl PartialEq for S3Config { + fn eq(&self, other: &Self) -> bool { + self.bucket == other.bucket + && self.root == other.root + && self.access_key_id.expose_secret() == other.access_key_id.expose_secret() + && self.secret_access_key.expose_secret() == other.secret_access_key.expose_secret() + && self.endpoint == other.endpoint + && self.region == other.region + && self.cache == other.cache + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct OssConfig { @@ -123,6 +135,17 @@ pub struct OssConfig { pub cache: ObjectStorageCacheConfig, } +impl PartialEq for OssConfig { + fn eq(&self, other: &Self) -> bool { + self.bucket == other.bucket + && self.root == other.root + && self.access_key_id.expose_secret() == other.access_key_id.expose_secret() + && self.access_key_secret.expose_secret() == other.access_key_secret.expose_secret() + && self.endpoint == other.endpoint + && self.cache == other.cache + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct AzblobConfig { @@ -138,6 +161,18 @@ pub struct AzblobConfig { pub cache: ObjectStorageCacheConfig, } +impl PartialEq for AzblobConfig { + fn eq(&self, other: &Self) -> bool { + self.container == other.container + && self.root == other.root + && self.account_name.expose_secret() == other.account_name.expose_secret() + && self.account_key.expose_secret() == other.account_key.expose_secret() + && self.endpoint == other.endpoint + && self.sas_token == other.sas_token + && self.cache == other.cache + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct GcsConfig { @@ -151,6 +186,17 @@ pub struct GcsConfig { pub cache: ObjectStorageCacheConfig, } +impl PartialEq for GcsConfig { + fn eq(&self, other: &Self) -> bool { + self.root == other.root + && self.bucket == other.bucket + && self.scope == other.scope + && self.credential_path.expose_secret() == other.credential_path.expose_secret() + && self.endpoint == other.endpoint + && self.cache == other.cache + } +} + impl Default for S3Config { fn default() -> Self { Self { @@ -211,7 +257,7 @@ impl Default for ObjectStoreConfig { } } -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, 
Deserialize, PartialEq)]
 #[serde(default)]
 pub struct DatanodeOptions {
     pub mode: Mode,
@@ -267,7 +313,7 @@ impl Default for DatanodeOptions {
     }
 }
 
-impl Configurable<'_> for DatanodeOptions {
+impl Configurable for DatanodeOptions {
     fn env_list_keys() -> Option<&'static [&'static str]> {
         Some(&["meta_client.metasrv_addrs", "wal.broker_endpoints"])
     }
diff --git a/src/frontend/src/frontend.rs b/src/frontend/src/frontend.rs
index f0dfac1c7d5c..7907ff20ffe0 100644
--- a/src/frontend/src/frontend.rs
+++ b/src/frontend/src/frontend.rs
@@ -74,7 +74,7 @@ impl Default for FrontendOptions {
     }
 }
 
-impl Configurable<'_> for FrontendOptions {
+impl Configurable for FrontendOptions {
     fn env_list_keys() -> Option<&'static [&'static str]> {
         Some(&["meta_client.metasrv_addrs"])
     }
diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs
index a1cc8934270f..c04770313a11 100644
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -188,7 +188,7 @@ impl Instance {
 
     pub fn build_servers(
         &mut self,
-        opts: impl Into<FrontendOptions> + for<'de> Configurable<'de>,
+        opts: impl Into<FrontendOptions> + Configurable,
         servers: ServerHandlers,
     ) -> Result<()> {
         let opts: FrontendOptions = opts.into();
diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs
index f5a0afb53016..268bc7db4ae6 100644
--- a/src/frontend/src/server.rs
+++ b/src/frontend/src/server.rs
@@ -39,7 +39,7 @@ use crate::service_config::GrpcOptions;
 
 pub struct Services<T, U>
 where
-    T: Into<FrontendOptions> + for<'de> Configurable<'de> + Clone,
+    T: Into<FrontendOptions> + Configurable + Clone,
     U: FrontendInstance,
 {
     opts: T,
@@ -51,7 +51,7 @@ where
 
 impl<T, U> Services<T, U>
 where
-    T: Into<FrontendOptions> + for<'de> Configurable<'de> + Clone,
+    T: Into<FrontendOptions> + Configurable + Clone,
     U: FrontendInstance,
 {
     pub fn new(opts: T, instance: Arc<U>, plugins: Plugins) -> Self {
diff --git a/src/frontend/src/service_config/datanode.rs b/src/frontend/src/service_config/datanode.rs
index ccf2b2ebf4c7..3b4de67b48c1 100644
--- a/src/frontend/src/service_config/datanode.rs
+++ b/src/frontend/src/service_config/datanode.rs
@@ -19,7 +19,7 @@ use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
 pub struct DatanodeOptions {
-    client: DatanodeClientOptions,
+    pub client: DatanodeClientOptions,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs
index 76fb794f797c..ce812cfba80f 100644
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -148,7 +148,7 @@ impl Default for MetasrvOptions {
     }
 }
 
-impl Configurable<'_> for MetasrvOptions {
+impl Configurable for MetasrvOptions {
     fn env_list_keys() -> Option<&'static [&'static str]> {
         Some(&["wal.broker_endpoints"])
     }
diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs
index 7cbd640820b1..4c1b4641c9f4 100644
--- a/tests-integration/src/test_util.rs
+++ b/tests-integration/src/test_util.rs
@@ -21,16 +21,13 @@ use std::time::Duration;
 use auth::UserProviderRef;
 use axum::Router;
 use catalog::kvbackend::KvBackendCatalogManager;
-use client::Database;
 use common_base::secrets::ExposeSecret;
 use common_config::Configurable;
 use common_meta::key::catalog_name::CatalogNameKey;
 use common_meta::key::schema_name::SchemaNameKey;
-use common_query::OutputData;
 use common_runtime::Builder as RuntimeBuilder;
 use common_telemetry::warn;
 use common_test_util::ports;
-use common_test_util::recordbatch::{check_output_stream, ExpectedOutput};
 use common_test_util::temp_dir::{create_temp_dir, TempDir};
 use 
common_wal::config::DatanodeWalConfig; use datanode::config::{ @@ -690,25 +687,3 @@ where test(endpoints).await } - -pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) { - let output = db.sql(sql).await.unwrap(); - let output = output.data; - - match (&output, expected) { - (OutputData::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => { - assert_eq!( - *x, y, - r#" -expected: {y} -actual: {x} -"# - ) - } - (OutputData::RecordBatches(_), ExpectedOutput::QueryResult(x)) - | (OutputData::Stream(_), ExpectedOutput::QueryResult(x)) => { - check_output_stream(output, x).await - } - _ => panic!(), - } -} diff --git a/tests-integration/tests/grpc.rs b/tests-integration/tests/grpc.rs index 7d1f9d57768f..33332170db16 100644 --- a/tests-integration/tests/grpc.rs +++ b/tests-integration/tests/grpc.rs @@ -28,6 +28,7 @@ use common_grpc::channel_manager::ClientTlsOption; use common_query::Output; use common_recordbatch::RecordBatches; use common_runtime::Runtime; +use common_test_util::find_workspace_path; use servers::grpc::builder::GrpcServerBuilder; use servers::grpc::GrpcServerConfig; use servers::http::prometheus::{ @@ -732,10 +733,7 @@ async fn to_batch(output: Output) -> String { } pub async fn test_grpc_tls_config(store_type: StorageType) { - let comm_dir = std::path::PathBuf::from_iter([ - std::env!("CARGO_RUSTC_CURRENT_DIR"), - "src/common/grpc/tests/tls", - ]); + let comm_dir = find_workspace_path("/src/common/grpc/tests/tls"); let ca_path = comm_dir.join("ca.pem").to_str().unwrap().to_string(); let server_cert_path = comm_dir.join("server.pem").to_str().unwrap().to_string(); let server_key_path = comm_dir.join("server.key").to_str().unwrap().to_string(); From 423e51e60b23e16e072adc60b9a61876455fa7b4 Mon Sep 17 00:00:00 2001 From: shuiyisong Date: Tue, 4 Jun 2024 20:03:27 +0800 Subject: [PATCH 16/16] chore: fmt --- src/frontend/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index d5489028a916..56f4ab904ae2 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -66,10 +66,10 @@ tonic.workspace = true catalog = { workspace = true, features = ["testing"] } common-test-util.workspace = true datanode.workspace = true +datatypes.workspace = true futures = "0.3" meta-srv = { workspace = true, features = ["mock"] } +serde_json.workspace = true strfmt = "0.2" tower.workspace = true uuid.workspace = true -datatypes.workspace = true -serde_json.workspace = true
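
For illustration, a minimal self-contained sketch of the bound change this series applies to `Configurable` (dropping the `'de` lifetime in favor of `DeserializeOwned`, so call sites no longer need `for<'de> Configurable<'de>`). This is not part of the patch series; `AppOptions` and `describe` are made-up names, and the real trait with `load_layered_options` lives in the `common-config` crate.

use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

/// Same shape as the reworked trait: no `'de` parameter, so generic code can
/// simply require `T: Configurable`.
pub trait Configurable: Serialize + DeserializeOwned + Default + Sized {
    /// Render the loaded configuration back to TOML, e.g. for logging.
    fn to_toml(&self) -> Result<String, toml::ser::Error> {
        toml::to_string(self)
    }
}

#[derive(Debug, Default, Serialize, Deserialize, PartialEq)]
#[serde(default)]
struct AppOptions {
    data_home: String,
    num_workers: usize,
}

impl Configurable for AppOptions {}

/// With the old trait this signature would have needed
/// `fn describe<T: for<'de> Configurable<'de>>(opts: &T)`.
fn describe<T: Configurable>(opts: &T) -> String {
    opts.to_toml().unwrap()
}

fn main() {
    // Because `Configurable` now requires `DeserializeOwned`, implementors own
    // their data and can outlive the transient buffer they were parsed from.
    let parsed: AppOptions = {
        let raw = String::from("data_home = \"/tmp/demo\"\nnum_workers = 4");
        toml::from_str(&raw).unwrap()
    };
    assert_eq!(parsed.num_workers, 4);
    println!("{}", describe(&parsed));
}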