diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md
index 360e547..ac23efb 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/README.md
@@ -1,5 +1,5 @@
 proto files are copied from
-https://github.com/apache/spark/tree/5b2d2149b615acdd8730547a1f24c2b637222545/sql/connect/common/src/main/protobuf
+https://github.com/apache/spark/tree/8a1f4acead0a580142152656913829700b710652/sql/connect/common/src/main/protobuf
 and with one additional change in each proto file
 
 ```patch
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto
index 0c3498a..38819f4 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/base.proto
@@ -25,6 +25,7 @@ import "spark/connect/common.proto";
 import "spark/connect/expressions.proto";
 import "spark/connect/relations.proto";
 import "spark/connect/types.proto";
+import "spark/connect/ml.proto";
 
 option java_multiple_files = true;
 option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
@@ -94,6 +95,7 @@ message AnalyzePlanRequest {
     Persist persist = 14;
     Unpersist unpersist = 15;
     GetStorageLevel get_storage_level = 16;
+    JsonToDDL json_to_ddl = 18;
   }
 
   message Schema {
@@ -199,6 +201,11 @@ message AnalyzePlanRequest {
     // (Required) The logical plan to get the storage level.
     Relation relation = 1;
   }
+
+  message JsonToDDL {
+    // (Required) The JSON formatted string to be converted to DDL.
+    string json_string = 1;
+  }
 }
 
 // Response to performing analysis of the query. Contains relevant metadata to be able to
@@ -224,6 +231,7 @@
     Persist persist = 12;
     Unpersist unpersist = 13;
     GetStorageLevel get_storage_level = 14;
+    JsonToDDL json_to_ddl = 16;
   }
 
   message Schema {
@@ -275,6 +283,10 @@
     // (Required) The StorageLevel as a result of get_storage_level request.
     StorageLevel storage_level = 1;
   }
+
+  message JsonToDDL {
+    string ddl_string = 1;
+  }
 }
 
 // A request to be executed by the service.
@@ -384,6 +396,9 @@
     // Response for command that checkpoints a DataFrame.
     CheckpointCommandResult checkpoint_command_result = 19;
 
+    // ML command response
+    MlCommandResult ml_command_result = 20;
+
     // Support arbitrary result objects.
     google.protobuf.Any extension = 999;
   }
@@ -514,6 +529,9 @@ message ConfigRequest {
   message Set {
     // (Required) The config key-value pairs to set.
     repeated KeyValue pairs = 1;
+
+    // (Optional) Whether to ignore failures.
+    optional bool silent = 2;
   }
 
   message Get {
@@ -522,7 +540,7 @@ message ConfigRequest {
   }
 
   message GetWithDefault {
-    // (Required) The config key-value paris to get. The value will be used as the default value.
+    // (Required) The config key-value pairs to get. The value will be used as the default value.
     repeated KeyValue pairs = 1;
   }
 
@@ -913,6 +931,20 @@ message ReleaseSessionRequest {
   // can be used for language or version specific information and is only intended for
   // logging purposes and will not be interpreted by the server.
   optional string client_type = 3;
+
+  // Signals the server to allow the client to reconnect to the session after it is released.
+  //
+  // By default, the server tombstones the session upon release, preventing reconnections and
+  // fully cleaning the session state.
+  //
+  // If this flag is set to true, the server may permit the client to reconnect to the session
+  // post-release, even if the session state has been cleaned. This can result in missing state,
+  // such as Temporary Views, Temporary UDFs, or the Current Catalog, in the reconnected session.
+  //
+  // Use this option sparingly and only when the client fully understands the implications of
+  // reconnecting to a released session. The client must ensure that any queries executed do not
+  // rely on the session state prior to its release.
+  bool allow_reconnect = 4;
 }
 
 // Next ID: 3
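The new `json_to_ddl` analyze variant is simple to drive from client code. A minimal sketch, not part of the patch, assuming the standard protobuf-java codegen into the relocated package declared above (the `session_id` field comes from the unchanged part of `base.proto`):

```java
import org.apache.kyuubi.shaded.spark.connect.proto.AnalyzePlanRequest;

public final class JsonToDdlExample {
  // Builds an analyze request asking the server to convert a
  // JSON-formatted schema string into its DDL form.
  public static AnalyzePlanRequest jsonToDdl(String sessionId, String jsonSchema) {
    return AnalyzePlanRequest.newBuilder()
        .setSessionId(sessionId)
        .setJsonToDdl(AnalyzePlanRequest.JsonToDDL.newBuilder()
            .setJsonString(jsonSchema))
        .build();
  }
}
```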
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto
index f84e33e..bcd782d 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/commands.proto
@@ -21,6 +21,7 @@ import "google/protobuf/any.proto";
 import "spark/connect/common.proto";
 import "spark/connect/expressions.proto";
 import "spark/connect/relations.proto";
+import "spark/connect/ml.proto";
 
 package spark.connect;
 
@@ -48,7 +49,7 @@ message Command {
     CheckpointCommand checkpoint_command = 14;
     RemoveCachedRemoteRelationCommand remove_cached_remote_relation_command = 15;
     MergeIntoTableCommand merge_into_table_command = 16;
-
+    MlCommand ml_command = 17;
     // This field is used to mark extensions to the protocol. When plugins generate arbitrary
     // Commands they can add them here. During the planning the correct resolution is done.
     google.protobuf.Any extension = 999;
@@ -507,6 +508,9 @@ message CheckpointCommand {
 
   // (Required) Whether to checkpoint this dataframe immediately.
   bool eager = 3;
+
+  // (Optional) For local checkpoint, the storage level to use.
+  optional StorageLevel storage_level = 4;
 }
 
 message MergeIntoTableCommand {
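The new optional `storage_level` on `CheckpointCommand` reuses the `StorageLevel` message that already exists in `common.proto`. A hedged sketch of setting it, assuming the standard protobuf-java codegen:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.CheckpointCommand;
import org.apache.kyuubi.shaded.spark.connect.proto.StorageLevel;

public final class CheckpointStorageLevelExample {
  // Attaches an explicit storage level to an existing checkpoint command.
  public static CheckpointCommand withMemoryDeserialized(CheckpointCommand base) {
    return base.toBuilder()
        .setStorageLevel(StorageLevel.newBuilder()
            .setUseMemory(true)
            .setDeserialized(true)
            .setReplication(1))
        .build();
  }
}
```

Because the field is declared `optional`, the generated class also exposes `hasStorageLevel()`, so the server can distinguish "unset" from a default value.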
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto
index e156aa4..c8e9f25 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/common.proto
@@ -86,6 +86,7 @@ message Origin {
   // (Required) Indicate the origin type.
   oneof function {
     PythonOrigin python_origin = 1;
+    JvmOrigin jvm_origin = 2;
   }
 }
 
@@ -96,3 +97,77 @@ message PythonOrigin {
   // (Required) Callsite to show to end users, for example, stacktrace.
   string call_site = 2;
 }
+
+message JvmOrigin {
+  // (Optional) Line number in the source file.
+  optional int32 line = 1;
+
+  // (Optional) Start position in the source file.
+  optional int32 start_position = 2;
+
+  // (Optional) Start index in the source file.
+  optional int32 start_index = 3;
+
+  // (Optional) Stop index in the source file.
+  optional int32 stop_index = 4;
+
+  // (Optional) SQL text.
+  optional string sql_text = 5;
+
+  // (Optional) Object type.
+  optional string object_type = 6;
+
+  // (Optional) Object name.
+  optional string object_name = 7;
+
+  // (Optional) Stack trace.
+  repeated StackTraceElement stack_trace = 8;
+}
+
+// A message to hold a [[java.lang.StackTraceElement]].
+message StackTraceElement {
+  // (Optional) Class loader name
+  optional string class_loader_name = 1;
+
+  // (Optional) Module name
+  optional string module_name = 2;
+
+  // (Optional) Module version
+  optional string module_version = 3;
+
+  // (Required) Declaring class
+  string declaring_class = 4;
+
+  // (Required) Method name
+  string method_name = 5;
+
+  // (Optional) File name
+  optional string file_name = 6;
+
+  // (Required) Line number
+  int32 line_number = 7;
+}
+
+message Bools {
+  repeated bool values = 1;
+}
+
+message Ints {
+  repeated int32 values = 1;
+}
+
+message Longs {
+  repeated int64 values = 1;
+}
+
+message Floats {
+  repeated float values = 1;
+}
+
+message Doubles {
+  repeated double values = 1;
+}
+
+message Strings {
+  repeated string values = 1;
+}
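`JvmOrigin` mirrors `java.lang.StackTraceElement`: the fields the JVM always provides are required, the nullable ones optional. A sketch of the obvious mapping, assuming the standard protobuf-java codegen (note the generated `StackTraceElement` class shadows `java.lang.StackTraceElement` once imported, hence the fully qualified parameter type):

```java
import org.apache.kyuubi.shaded.spark.connect.proto.StackTraceElement;

public final class StackTraceConverter {
  // Copies one JVM stack frame into the proto message; optional fields
  // are only set when the JVM actually reports them.
  public static StackTraceElement toProto(java.lang.StackTraceElement frame) {
    StackTraceElement.Builder builder = StackTraceElement.newBuilder()
        .setDeclaringClass(frame.getClassName())
        .setMethodName(frame.getMethodName())
        .setLineNumber(frame.getLineNumber());
    if (frame.getFileName() != null) {
      builder.setFileName(frame.getFileName());
    }
    if (frame.getClassLoaderName() != null) {
      builder.setClassLoaderName(frame.getClassLoaderName());
    }
    return builder.build();
  }
}
```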
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto
index a11b0c1..7b02a6b 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/expressions.proto
@@ -52,6 +52,8 @@ message Expression {
     NamedArgumentExpression named_argument_expression = 17;
     MergeAction merge_action = 19;
     TypedAggregateExpression typed_aggregate_expression = 20;
+    LazyExpression lazy_expression = 21;
+    SubqueryExpression subquery_expression = 22;
 
     // This field is used to mark extensions to the protocol. When plugins generate arbitrary
     // relations they can add them here. During the planning the correct resolution is done.
@@ -192,6 +194,8 @@ message Expression {
       Array array = 22;
       Map map = 23;
       Struct struct = 24;
+
+      SpecializedArray specialized_array = 25;
     }
 
     message Decimal {
@@ -226,6 +230,17 @@ message Expression {
       DataType struct_type = 1;
      repeated Literal elements = 2;
    }
+
+    message SpecializedArray {
+      oneof value_type {
+        Bools bools = 1;
+        Ints ints = 2;
+        Longs longs = 3;
+        Floats floats = 4;
+        Doubles doubles = 5;
+        Strings strings = 6;
+      }
+    }
  }
 
   // An unresolved attribute that is not explicitly bound to a specific column, but the column
@@ -259,6 +274,11 @@ message Expression {
     // When it is not a user defined function, Connect will use the function name directly.
     // When it is a user defined function, Connect will parse the function name first.
     bool is_user_defined_function = 4;
+
+    // (Optional) Indicate if this function is defined in the internal function registry.
+    // If not set, the server will try to look up the function in the internal function registry
+    // and decide appropriately.
+    optional bool is_internal = 5;
   }
 
   // Expression as string.
@@ -451,3 +471,22 @@ message MergeAction {
     Expression value = 2;
   }
 }
+
+message LazyExpression {
+  // (Required) The expression to be marked as lazy.
+  Expression child = 1;
+}
+
+message SubqueryExpression {
+  // (Required) The id of corresponding connect plan.
+  int64 plan_id = 1;
+
+  // (Required) The type of the subquery.
+  SubqueryType subquery_type = 2;
+
+  enum SubqueryType {
+    SUBQUERY_TYPE_UNKNOWN = 0;
+    SUBQUERY_TYPE_SCALAR = 1;
+    SUBQUERY_TYPE_EXISTS = 2;
+  }
+}
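`SpecializedArray` lets a literal carry a whole primitive array in one packed message (the `Bools` … `Strings` wrappers added to `common.proto` above) instead of one `Literal` per element. A sketch for `int` values, assuming the standard protobuf-java codegen:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.Expression;
import org.apache.kyuubi.shaded.spark.connect.proto.Ints;

public final class SpecializedArrayExample {
  // Encodes an int[] as a single specialized-array literal.
  public static Expression.Literal intArrayLiteral(int... values) {
    Ints.Builder ints = Ints.newBuilder();
    for (int v : values) {
      ints.addValues(v);
    }
    return Expression.Literal.newBuilder()
        .setSpecializedArray(Expression.Literal.SpecializedArray.newBuilder()
            .setInts(ints))
        .build();
  }
}
```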
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/ml.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/ml.proto
new file mode 100644
index 0000000..231a7c3
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/ml.proto
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "spark/connect/relations.proto";
+import "spark/connect/expressions.proto";
+import "spark/connect/ml_common.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// Command for ML
+message MlCommand {
+  oneof command {
+    Fit fit = 1;
+    Fetch fetch = 2;
+    Delete delete = 3;
+    Write write = 4;
+    Read read = 5;
+    Evaluate evaluate = 6;
+  }
+
+  // Command for estimator.fit(dataset)
+  message Fit {
+    // Estimator information
+    MlOperator estimator = 1;
+    // parameters of the Estimator
+    MlParams params = 2;
+    // the training dataset
+    Relation dataset = 3;
+  }
+
+  // Command to delete the cached object which could be a model
+  // or summary evaluated by a model
+  message Delete {
+    ObjectRef obj_ref = 1;
+  }
+
+  // Command to write ML operator
+  message Write {
+    // It could be an estimator/evaluator or the cached model
+    oneof type {
+      // Estimator or evaluator
+      MlOperator operator = 1;
+      // The cached model
+      ObjectRef obj_ref = 2;
+    }
+    // The parameters of operator which could be estimator/evaluator or a cached model
+    MlParams params = 3;
+    // Save the ML instance to the path
+    string path = 4;
+    // Overwrites if the output path already exists.
+    bool should_overwrite = 5;
+    // The options of the writer
+    map<string, string> options = 6;
+  }
+
+  // Command to load ML operator.
+  message Read {
+    // ML operator information
+    MlOperator operator = 1;
+    // Load the ML instance from the input path
+    string path = 2;
+  }
+
+  // Command for evaluator.evaluate(dataset)
+  message Evaluate {
+    // Evaluator information
+    MlOperator evaluator = 1;
+    // parameters of the Evaluator
+    MlParams params = 2;
+    // the evaluating dataset
+    Relation dataset = 3;
+  }
+}
+
+// The result of MlCommand
+message MlCommandResult {
+  oneof result_type {
+    // The result of the attribute
+    Expression.Literal param = 1;
+    // Evaluate a Dataset in a model and return the cached ID of summary
+    string summary = 2;
+    // Operator information
+    MlOperatorInfo operator_info = 3;
+  }
+
+  // Represents an operator info
+  message MlOperatorInfo {
+    oneof type {
+      // The cached object which could be a model or summary evaluated by a model
+      ObjectRef obj_ref = 1;
+      // Operator name
+      string name = 2;
+    }
+    string uid = 3;
+    MlParams params = 4;
+  }
+
+}
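A `fit` round trip pairs an `MlOperator` describing the estimator with the training `Relation`. A sketch, assuming the standard protobuf-java codegen; the estimator class name and uid are illustrative placeholders, not values mandated by the protocol:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.MlCommand;
import org.apache.kyuubi.shaded.spark.connect.proto.MlOperator;
import org.apache.kyuubi.shaded.spark.connect.proto.Relation;

public final class MlFitExample {
  // Builds an MlCommand that asks the server to fit an estimator
  // against the given training relation.
  public static MlCommand fitCommand(Relation trainingData) {
    MlOperator estimator = MlOperator.newBuilder()
        .setName("org.apache.spark.ml.classification.LogisticRegression") // illustrative
        .setUid("logreg_example_uid")                                     // illustrative
        .setType(MlOperator.OperatorType.ESTIMATOR)
        .build();
    return MlCommand.newBuilder()
        .setFit(MlCommand.Fit.newBuilder()
            .setEstimator(estimator)
            .setDataset(trainingData))
        .build();
  }
}
```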
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/ml_common.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/ml_common.proto
new file mode 100644
index 0000000..fd10075
--- /dev/null
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/ml_common.proto
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "spark/connect/expressions.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
+option go_package = "internal/generated";
+
+// MlParams stores param settings for ML Estimator / Transformer / Evaluator
+message MlParams {
+  // User-supplied params
+  map<string, Expression.Literal> params = 1;
+}
+
+// MLOperator represents the ML operators like (Estimator, Transformer or Evaluator)
+message MlOperator {
+  // The qualified name of the ML operator.
+  string name = 1;
+  // Unique id of the ML operator
+  string uid = 2;
+  // Represents what the ML operator is
+  OperatorType type = 3;
+  enum OperatorType {
+    UNSPECIFIED = 0;
+    ESTIMATOR = 1;
+    TRANSFORMER = 2;
+    EVALUATOR = 3;
+    MODEL = 4;
+  }
+}
+
+// Represents a reference to the cached object which could be a model
+// or summary evaluated by a model
+message ObjectRef {
+  // The ID is used to lookup the object on the server side.
+  string id = 1;
+}
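Note that `params` maps param names to `Expression.Literal` values, which is why `ml_common.proto` imports `expressions.proto`. A sketch, assuming the standard protobuf-java codegen; the param names are illustrative:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.Expression;
import org.apache.kyuubi.shaded.spark.connect.proto.MlParams;

public final class MlParamsExample {
  // Param values travel as typed literals rather than strings,
  // so the server does not have to re-parse them.
  public static MlParams exampleParams() {
    return MlParams.newBuilder()
        .putParams("maxIter", Expression.Literal.newBuilder().setInteger(10).build())
        .putParams("regParam", Expression.Literal.newBuilder().setDouble(0.01).build())
        .build();
  }
}
```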
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto
index 1535776..b401f7f 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/relations.proto
@@ -24,6 +24,7 @@ import "spark/connect/expressions.proto";
 import "spark/connect/types.proto";
 import "spark/connect/catalog.proto";
 import "spark/connect/common.proto";
+import "spark/connect/ml_common.proto";
 
 option java_multiple_files = true;
 option java_package = "org.apache.kyuubi.shaded.spark.connect.proto";
@@ -76,6 +77,9 @@
     AsOfJoin as_of_join = 39;
     CommonInlineUserDefinedDataSource common_inline_user_defined_data_source = 40;
     WithRelations with_relations = 41;
+    Transpose transpose = 42;
+    UnresolvedTableValuedFunction unresolved_table_valued_function = 43;
+    LateralJoin lateral_join = 44;
 
     // NA functions
     NAFill fill_na = 90;
@@ -95,6 +99,9 @@
     // Catalog API (experimental / unstable)
     Catalog catalog = 200;
 
+    // ML relation
+    MlRelation ml_relation = 300;
+
     // This field is used to mark extensions to the protocol. When plugins generate arbitrary
     // relations they can add them here. During the planning the correct resolution is done.
     google.protobuf.Any extension = 998;
@@ -102,6 +109,55 @@
   }
 }
 
+// Relation to represent ML world
+message MlRelation {
+  oneof ml_type {
+    Transform transform = 1;
+    Fetch fetch = 2;
+  }
+  // Relation to represent transform(input) of the operator
+  // which could be a cached model or a new transformer
+  message Transform {
+    oneof operator {
+      // Object reference
+      ObjectRef obj_ref = 1;
+      // Could be an ML transformer like VectorAssembler
+      MlOperator transformer = 2;
+    }
+    // the input dataframe
+    Relation input = 3;
+    // the operator specific parameters
+    MlParams params = 4;
+  }
+}
+
+// Message for fetching attribute from object on the server side.
+// Fetch can be represented as a Relation or a ML command
+// Command: model.coefficients, model.summary.weightedPrecision which
+// returns the final literal result
+// Relation: model.summary.roc which returns a DataFrame (Relation)
+message Fetch {
+  // (Required) reference to the object on the server side
+  ObjectRef obj_ref = 1;
+  // (Required) the calling method chains
+  repeated Method methods = 2;
+
+  // Represents a method with inclusion of method name and its arguments
+  message Method {
+    // (Required) the method name
+    string method = 1;
+    // (Optional) the arguments of the method
+    repeated Args args = 2;
+
+    message Args {
+      oneof args_type {
+        Expression.Literal param = 1;
+        Relation input = 2;
+      }
+    }
+  }
+}
+
 // Used for testing purposes only.
 message Unknown {}
 
@@ -889,6 +945,26 @@
   }
 }
 
+// Transpose a DataFrame, switching rows to columns.
+// Transforms the DataFrame such that the values in the specified index column
+// become the new columns of the DataFrame.
+message Transpose {
+  // (Required) The input relation.
+  Relation input = 1;
+
+  // (Optional) A list of columns that will be treated as the indices.
+  // Only single column is supported now.
+  repeated Expression index_columns = 2;
+}
+
+message UnresolvedTableValuedFunction {
+  // (Required) name (or unparsed name for user defined function) for the unresolved function.
+  string function_name = 1;
+
+  // (Optional) Function arguments. Empty arguments are allowed.
+  repeated Expression arguments = 2;
+}
+
 message ToSchema {
   // (Required) The input relation.
   Relation input = 1;
@@ -952,6 +1028,9 @@
 
   // (Optional) Timeout configuration for groups that do not receive data for a while.
   optional string timeout_conf = 9;
+
+  // (Optional) The schema for the grouped state.
+  optional DataType state_schema = 10;
 }
 
 message CoGroupMap {
@@ -1118,3 +1197,20 @@
   // (Required) Whether to search for prior, subsequent, or closest matches.
   string direction = 10;
 }
+
+// Relation of type [[LateralJoin]].
+//
+// `left` and `right` must be present.
+message LateralJoin {
+  // (Required) Left input relation for a Join.
+  Relation left = 1;
+
+  // (Required) Right input relation for a Join.
+  Relation right = 2;
+
+  // (Optional) The join condition.
+  Expression join_condition = 3;
+
+  // (Required) The join type.
+  Join.JoinType join_type = 4;
+}
diff --git a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto
index d6b4ffb..6b4ee84 100644
--- a/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto
+++ b/kyuubi-relocated-spark-connect-rpc/src/main/protobuf/spark/connect/types.proto
@@ -188,10 +188,14 @@ message DataType {
 
   message UDT {
     string type = 1;
+    // Required for Scala/Java UDT
     optional string jvm_class = 2;
+    // Required for Python UDT
     optional string python_class = 3;
+    // Required for Python UDT
     optional string serialized_python_class = 4;
-    DataType sql_type = 5;
+    // Required for Python UDT
+    optional DataType sql_type = 5;
   }
 
   message Unparsed {
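Of the new relations, `Transpose` is typical: it wraps an input relation plus index-column expressions. A sketch that plugs it into a `Relation`, assuming the standard protobuf-java codegen; using an unresolved attribute is just one way to name the index column:

```java
import org.apache.kyuubi.shaded.spark.connect.proto.Expression;
import org.apache.kyuubi.shaded.spark.connect.proto.Relation;
import org.apache.kyuubi.shaded.spark.connect.proto.Transpose;

public final class TransposeExample {
  // Wraps an input relation in a Transpose node, pivoting on one column.
  public static Relation transposeOn(Relation input, String indexColumn) {
    return Relation.newBuilder()
        .setTranspose(Transpose.newBuilder()
            .setInput(input)
            .addIndexColumns(Expression.newBuilder()
                .setUnresolvedAttribute(Expression.UnresolvedAttribute.newBuilder()
                    .setUnparsedIdentifier(indexColumn))))
        .build();
  }
}
```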