diff --git a/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto b/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto index 4aac2bcc612b..782bcc5d1fcd 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto @@ -48,6 +48,7 @@ message Expression { CommonInlineUserDefinedFunction common_inline_user_defined_function = 15; CallFunction call_function = 16; NamedArgumentExpression named_argument_expression = 17; + GetColumnByOrdinal get_column_by_ordinal = 18; // This field is used to mark extensions to the protocol. When plugins generate arbitrary // relations they can add them here. During the planning the correct resolution is done. @@ -228,6 +229,15 @@ message Expression { optional bool is_metadata_column = 3; } + // An unresolved attribute that is represented by its column index. + message GetColumnByOrdinal { + // (Required) 0-based column index. + int32 ordinal = 1; + + // (Optional) The id of corresponding connect plan. + optional int64 plan_id = 2; + } + // An unresolved function is not explicitly bound to one explicit function, but the function // is resolved during analysis following Sparks name resolution rules. 
message UnresolvedFunction { diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index b8ab5539b30f..b88dfe35be25 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -45,7 +45,7 @@ import org.apache.spark.ml.{functions => MLFunctions} import org.apache.spark.sql.{Column, Dataset, Encoders, ForeachWriter, RelationalGroupedDataset, SparkSession} import org.apache.spark.sql.avro.{AvroDataToCatalyst, CatalystDataToAvro} import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier} -import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.analysis.{GetColumnByOrdinal, GlobalTempView, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.UnboundRowEncoder import org.apache.spark.sql.catalyst.expressions._ @@ -1344,6 +1344,8 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { case proto.Expression.ExprTypeCase.LITERAL => transformLiteral(exp.getLiteral) case proto.Expression.ExprTypeCase.UNRESOLVED_ATTRIBUTE => transformUnresolvedAttribute(exp.getUnresolvedAttribute) + case 
proto.Expression.ExprTypeCase.GET_COLUMN_BY_ORDINAL => + transformGetColumnByOrdinal(exp.getGetColumnByOrdinal) case proto.Expression.ExprTypeCase.UNRESOLVED_FUNCTION => transformUnregisteredFunction(exp.getUnresolvedFunction) .getOrElse(transformUnresolvedFunction(exp.getUnresolvedFunction)) @@ -1397,6 +1399,16 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { expr } + private def transformGetColumnByOrdinal( + attr: proto.Expression.GetColumnByOrdinal): GetColumnByOrdinal = { + // always set dataType field null, since it is not used in Analyzer + val expr = GetColumnByOrdinal(attr.getOrdinal, null) + if (attr.hasPlanId) { + expr.setTagValue(LogicalPlan.PLAN_ID_TAG, attr.getPlanId) + } + expr + } + private def transformExpressionPlugin(extension: ProtoAny): Expression = { SparkConnectPluginRegistry.expressionRegistry // Lazily traverse the collection. diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index 639a60a0fb33..d7fe209e415b 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -68,7 +68,10 @@ from pyspark.sql.connect.readwriter import DataFrameWriter, DataFrameWriterV2 from pyspark.sql.connect.streaming.readwriter import DataStreamWriter from pyspark.sql.connect.column import Column -from pyspark.sql.connect.expressions import UnresolvedRegex +from pyspark.sql.connect.expressions import ( + UnresolvedRegex, + GetColumnByOrdinal, +) from pyspark.sql.connect.functions import ( _to_col_with_plan_id, _to_col, @@ -1654,10 +1657,10 @@ def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame": ... 
def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Column, "DataFrame"]: - if isinstance(item, str): - if self._plan is None: - raise SparkConnectException("Cannot analyze on empty plan.") + if self._plan is None: + raise SparkConnectException("Cannot analyze on empty plan.") + if isinstance(item, str): # validate the column name if not hasattr(self._session, "is_mock_session"): self.select(item).isLocal() @@ -1671,7 +1674,15 @@ def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Colum elif isinstance(item, (list, tuple)): return self.select(*item) elif isinstance(item, int): - return col(self.columns[item]) + n = len(self.columns) + # 1, convert bool; 2, convert negative index; 3, validate index + item = range(0, n)[int(item)] + return Column( + GetColumnByOrdinal( + ordinal=item, + plan_id=self._plan._plan_id, + ) + ) + else: raise PySparkTypeError( error_class="NOT_COLUMN_OR_INT_OR_LIST_OR_STR_OR_TUPLE", diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py index 34aa4da11179..3174661b4bf6 100644 --- a/python/pyspark/sql/connect/expressions.py +++ b/python/pyspark/sql/connect/expressions.py @@ -488,6 +488,40 @@ def __eq__(self, other: Any) -> bool: ) +class GetColumnByOrdinal(Expression): + """Represents a column index (0-based). There is no guarantee that this column + actually exists.
In the context of this project, we refer by its index and + treat it as an unresolved GetColumnByOrdinal""" + + def __init__(self, ordinal: int, plan_id: Optional[int] = None): + super().__init__() + + assert isinstance(ordinal, int) and ordinal >= 0 + self._ordinal = ordinal + + assert plan_id is None or isinstance(plan_id, int) + self._plan_id = plan_id + + def to_plan(self, session: "SparkConnectClient") -> proto.Expression: + """Returns the Proto representation of the expression.""" + expr = proto.Expression() + expr.get_column_by_ordinal.ordinal = self._ordinal + if self._plan_id is not None: + expr.get_column_by_ordinal.plan_id = self._plan_id + return expr + + def __repr__(self) -> str: + return f"getcolumnbyordinal({self._ordinal})" + + def __eq__(self, other: Any) -> bool: + return ( + other is not None + and isinstance(other, GetColumnByOrdinal) + and other._ordinal == self._ordinal + and other._plan_id == self._plan_id + ) + + class UnresolvedStar(Expression): def __init__(self, unparsed_target: Optional[str]): super().__init__() diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.py b/python/pyspark/sql/connect/proto/expressions_pb2.py index eb125fab39cb..6d7a4d3bdcd6 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.py +++ b/python/pyspark/sql/connect/proto/expressions_pb2.py @@ -33,7 +33,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1fspark/connect/expressions.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x19spark/connect/types.proto"\x8a-\n\nExpression\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12\x62\n\x14unresolved_attribute\x18\x02 \x01(\x0b\x32-.spark.connect.Expression.UnresolvedAttributeH\x00R\x13unresolvedAttribute\x12_\n\x13unresolved_function\x18\x03 \x01(\x0b\x32,.spark.connect.Expression.UnresolvedFunctionH\x00R\x12unresolvedFunction\x12Y\n\x11\x65xpression_string\x18\x04 
\x01(\x0b\x32*.spark.connect.Expression.ExpressionStringH\x00R\x10\x65xpressionString\x12S\n\x0funresolved_star\x18\x05 \x01(\x0b\x32(.spark.connect.Expression.UnresolvedStarH\x00R\x0eunresolvedStar\x12\x37\n\x05\x61lias\x18\x06 \x01(\x0b\x32\x1f.spark.connect.Expression.AliasH\x00R\x05\x61lias\x12\x34\n\x04\x63\x61st\x18\x07 \x01(\x0b\x32\x1e.spark.connect.Expression.CastH\x00R\x04\x63\x61st\x12V\n\x10unresolved_regex\x18\x08 \x01(\x0b\x32).spark.connect.Expression.UnresolvedRegexH\x00R\x0funresolvedRegex\x12\x44\n\nsort_order\x18\t \x01(\x0b\x32#.spark.connect.Expression.SortOrderH\x00R\tsortOrder\x12S\n\x0flambda_function\x18\n \x01(\x0b\x32(.spark.connect.Expression.LambdaFunctionH\x00R\x0elambdaFunction\x12:\n\x06window\x18\x0b \x01(\x0b\x32 .spark.connect.Expression.WindowH\x00R\x06window\x12l\n\x18unresolved_extract_value\x18\x0c \x01(\x0b\x32\x30.spark.connect.Expression.UnresolvedExtractValueH\x00R\x16unresolvedExtractValue\x12M\n\rupdate_fields\x18\r \x01(\x0b\x32&.spark.connect.Expression.UpdateFieldsH\x00R\x0cupdateFields\x12\x82\x01\n unresolved_named_lambda_variable\x18\x0e \x01(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableH\x00R\x1dunresolvedNamedLambdaVariable\x12~\n#common_inline_user_defined_function\x18\x0f \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x1f\x63ommonInlineUserDefinedFunction\x12\x42\n\rcall_function\x18\x10 \x01(\x0b\x32\x1b.spark.connect.CallFunctionH\x00R\x0c\x63\x61llFunction\x12\x64\n\x19named_argument_expression\x18\x11 \x01(\x0b\x32&.spark.connect.NamedArgumentExpressionH\x00R\x17namedArgumentExpression\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x1a\x8f\x06\n\x06Window\x12\x42\n\x0fwindow_function\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0ewindowFunction\x12@\n\x0epartition_spec\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\rpartitionSpec\x12\x42\n\norder_spec\x18\x03 
\x03(\x0b\x32#.spark.connect.Expression.SortOrderR\torderSpec\x12K\n\nframe_spec\x18\x04 \x01(\x0b\x32,.spark.connect.Expression.Window.WindowFrameR\tframeSpec\x1a\xed\x03\n\x0bWindowFrame\x12U\n\nframe_type\x18\x01 \x01(\x0e\x32\x36.spark.connect.Expression.Window.WindowFrame.FrameTypeR\tframeType\x12P\n\x05lower\x18\x02 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05lower\x12P\n\x05upper\x18\x03 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05upper\x1a\x91\x01\n\rFrameBoundary\x12!\n\x0b\x63urrent_row\x18\x01 \x01(\x08H\x00R\ncurrentRow\x12\x1e\n\tunbounded\x18\x02 \x01(\x08H\x00R\tunbounded\x12\x31\n\x05value\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\x05valueB\n\n\x08\x62oundary"O\n\tFrameType\x12\x18\n\x14\x46RAME_TYPE_UNDEFINED\x10\x00\x12\x12\n\x0e\x46RAME_TYPE_ROW\x10\x01\x12\x14\n\x10\x46RAME_TYPE_RANGE\x10\x02\x1a\xa9\x03\n\tSortOrder\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12O\n\tdirection\x18\x02 \x01(\x0e\x32\x31.spark.connect.Expression.SortOrder.SortDirectionR\tdirection\x12U\n\rnull_ordering\x18\x03 \x01(\x0e\x32\x30.spark.connect.Expression.SortOrder.NullOrderingR\x0cnullOrdering"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"U\n\x0cNullOrdering\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x1a\x91\x01\n\x04\x43\x61st\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12-\n\x04type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04type\x12\x1b\n\x08type_str\x18\x03 \x01(\tH\x00R\x07typeStrB\x0e\n\x0c\x63\x61st_to_type\x1a\x9b\x0c\n\x07Literal\x12-\n\x04null\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04null\x12\x18\n\x06\x62inary\x18\x02 \x01(\x0cH\x00R\x06\x62inary\x12\x1a\n\x07\x62oolean\x18\x03 
\x01(\x08H\x00R\x07\x62oolean\x12\x14\n\x04\x62yte\x18\x04 \x01(\x05H\x00R\x04\x62yte\x12\x16\n\x05short\x18\x05 \x01(\x05H\x00R\x05short\x12\x1a\n\x07integer\x18\x06 \x01(\x05H\x00R\x07integer\x12\x14\n\x04long\x18\x07 \x01(\x03H\x00R\x04long\x12\x16\n\x05\x66loat\x18\n \x01(\x02H\x00R\x05\x66loat\x12\x18\n\x06\x64ouble\x18\x0b \x01(\x01H\x00R\x06\x64ouble\x12\x45\n\x07\x64\x65\x63imal\x18\x0c \x01(\x0b\x32).spark.connect.Expression.Literal.DecimalH\x00R\x07\x64\x65\x63imal\x12\x18\n\x06string\x18\r \x01(\tH\x00R\x06string\x12\x14\n\x04\x64\x61te\x18\x10 \x01(\x05H\x00R\x04\x64\x61te\x12\x1e\n\ttimestamp\x18\x11 \x01(\x03H\x00R\ttimestamp\x12%\n\rtimestamp_ntz\x18\x12 \x01(\x03H\x00R\x0ctimestampNtz\x12\x61\n\x11\x63\x61lendar_interval\x18\x13 \x01(\x0b\x32\x32.spark.connect.Expression.Literal.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12\x30\n\x13year_month_interval\x18\x14 \x01(\x05H\x00R\x11yearMonthInterval\x12,\n\x11\x64\x61y_time_interval\x18\x15 \x01(\x03H\x00R\x0f\x64\x61yTimeInterval\x12?\n\x05\x61rray\x18\x16 \x01(\x0b\x32\'.spark.connect.Expression.Literal.ArrayH\x00R\x05\x61rray\x12\x39\n\x03map\x18\x17 \x01(\x0b\x32%.spark.connect.Expression.Literal.MapH\x00R\x03map\x12\x42\n\x06struct\x18\x18 \x01(\x0b\x32(.spark.connect.Expression.Literal.StructH\x00R\x06struct\x1au\n\x07\x44\x65\x63imal\x12\x14\n\x05value\x18\x01 \x01(\tR\x05value\x12!\n\tprecision\x18\x02 \x01(\x05H\x00R\tprecision\x88\x01\x01\x12\x19\n\x05scale\x18\x03 \x01(\x05H\x01R\x05scale\x88\x01\x01\x42\x0c\n\n_precisionB\x08\n\x06_scale\x1a\x62\n\x10\x43\x61lendarInterval\x12\x16\n\x06months\x18\x01 \x01(\x05R\x06months\x12\x12\n\x04\x64\x61ys\x18\x02 \x01(\x05R\x04\x64\x61ys\x12"\n\x0cmicroseconds\x18\x03 \x01(\x03R\x0cmicroseconds\x1a\x82\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12=\n\x08\x65lements\x18\x02 
\x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lements\x1a\xe3\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12\x35\n\x04keys\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x04keys\x12\x39\n\x06values\x18\x04 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1a\x81\x01\n\x06Struct\x12\x38\n\x0bstruct_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\nstructType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lementsB\x0e\n\x0cliteral_type\x1a\xba\x01\n\x13UnresolvedAttribute\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12\x31\n\x12is_metadata_column\x18\x03 \x01(\x08H\x01R\x10isMetadataColumn\x88\x01\x01\x42\n\n\x08_plan_idB\x15\n\x13_is_metadata_column\x1a\xcc\x01\n\x12UnresolvedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x1f\n\x0bis_distinct\x18\x03 \x01(\x08R\nisDistinct\x12\x37\n\x18is_user_defined_function\x18\x04 \x01(\x08R\x15isUserDefinedFunction\x1a\x32\n\x10\x45xpressionString\x12\x1e\n\nexpression\x18\x01 \x01(\tR\nexpression\x1aR\n\x0eUnresolvedStar\x12,\n\x0funparsed_target\x18\x01 \x01(\tH\x00R\x0eunparsedTarget\x88\x01\x01\x42\x12\n\x10_unparsed_target\x1aV\n\x0fUnresolvedRegex\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\x84\x01\n\x16UnresolvedExtractValue\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12\x39\n\nextraction\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nextraction\x1a\xbb\x01\n\x0cUpdateFields\x12\x46\n\x11struct_expression\x18\x01 
\x01(\x0b\x32\x19.spark.connect.ExpressionR\x10structExpression\x12\x1d\n\nfield_name\x18\x02 \x01(\tR\tfieldName\x12\x44\n\x10value_expression\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0fvalueExpression\x1ax\n\x05\x41lias\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12\x12\n\x04name\x18\x02 \x03(\tR\x04name\x12\x1f\n\x08metadata\x18\x03 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x9e\x01\n\x0eLambdaFunction\x12\x35\n\x08\x66unction\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08\x66unction\x12U\n\targuments\x18\x02 \x03(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableR\targuments\x1a>\n\x1dUnresolvedNamedLambdaVariable\x12\x1d\n\nname_parts\x18\x01 \x03(\tR\tnamePartsB\x0b\n\texpr_type"\xec\x02\n\x1f\x43ommonInlineUserDefinedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x39\n\npython_udf\x18\x04 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\tpythonUdf\x12I\n\x10scalar_scala_udf\x18\x05 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\x0escalarScalaUdf\x12\x33\n\x08java_udf\x18\x06 \x01(\x0b\x32\x16.spark.connect.JavaUDFH\x00R\x07javaUdfB\n\n\x08\x66unction"\x9b\x01\n\tPythonUDF\x12\x38\n\x0boutput_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVer"\xb8\x01\n\x0eScalarScalaUDF\x12\x18\n\x07payload\x18\x01 \x01(\x0cR\x07payload\x12\x37\n\ninputTypes\x18\x02 \x03(\x0b\x32\x17.spark.connect.DataTypeR\ninputTypes\x12\x37\n\noutputType\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1a\n\x08nullable\x18\x04 \x01(\x08R\x08nullable"\x95\x01\n\x07JavaUDF\x12\x1d\n\nclass_name\x18\x01 
\x01(\tR\tclassName\x12=\n\x0boutput_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\noutputType\x88\x01\x01\x12\x1c\n\taggregate\x18\x03 \x01(\x08R\taggregateB\x0e\n\x0c_output_type"l\n\x0c\x43\x61llFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"\\\n\x17NamedArgumentExpression\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05valueB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1fspark/connect/expressions.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x19spark/connect/types.proto"\xc7.\n\nExpression\x12=\n\x07literal\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x07literal\x12\x62\n\x14unresolved_attribute\x18\x02 \x01(\x0b\x32-.spark.connect.Expression.UnresolvedAttributeH\x00R\x13unresolvedAttribute\x12_\n\x13unresolved_function\x18\x03 \x01(\x0b\x32,.spark.connect.Expression.UnresolvedFunctionH\x00R\x12unresolvedFunction\x12Y\n\x11\x65xpression_string\x18\x04 \x01(\x0b\x32*.spark.connect.Expression.ExpressionStringH\x00R\x10\x65xpressionString\x12S\n\x0funresolved_star\x18\x05 \x01(\x0b\x32(.spark.connect.Expression.UnresolvedStarH\x00R\x0eunresolvedStar\x12\x37\n\x05\x61lias\x18\x06 \x01(\x0b\x32\x1f.spark.connect.Expression.AliasH\x00R\x05\x61lias\x12\x34\n\x04\x63\x61st\x18\x07 \x01(\x0b\x32\x1e.spark.connect.Expression.CastH\x00R\x04\x63\x61st\x12V\n\x10unresolved_regex\x18\x08 \x01(\x0b\x32).spark.connect.Expression.UnresolvedRegexH\x00R\x0funresolvedRegex\x12\x44\n\nsort_order\x18\t \x01(\x0b\x32#.spark.connect.Expression.SortOrderH\x00R\tsortOrder\x12S\n\x0flambda_function\x18\n \x01(\x0b\x32(.spark.connect.Expression.LambdaFunctionH\x00R\x0elambdaFunction\x12:\n\x06window\x18\x0b \x01(\x0b\x32 .spark.connect.Expression.WindowH\x00R\x06window\x12l\n\x18unresolved_extract_value\x18\x0c 
\x01(\x0b\x32\x30.spark.connect.Expression.UnresolvedExtractValueH\x00R\x16unresolvedExtractValue\x12M\n\rupdate_fields\x18\r \x01(\x0b\x32&.spark.connect.Expression.UpdateFieldsH\x00R\x0cupdateFields\x12\x82\x01\n unresolved_named_lambda_variable\x18\x0e \x01(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableH\x00R\x1dunresolvedNamedLambdaVariable\x12~\n#common_inline_user_defined_function\x18\x0f \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x1f\x63ommonInlineUserDefinedFunction\x12\x42\n\rcall_function\x18\x10 \x01(\x0b\x32\x1b.spark.connect.CallFunctionH\x00R\x0c\x63\x61llFunction\x12\x64\n\x19named_argument_expression\x18\x11 \x01(\x0b\x32&.spark.connect.NamedArgumentExpressionH\x00R\x17namedArgumentExpression\x12\x61\n\x15get_column_by_ordinal\x18\x12 \x01(\x0b\x32,.spark.connect.Expression.GetColumnByOrdinalH\x00R\x12getColumnByOrdinal\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x1a\x8f\x06\n\x06Window\x12\x42\n\x0fwindow_function\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0ewindowFunction\x12@\n\x0epartition_spec\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\rpartitionSpec\x12\x42\n\norder_spec\x18\x03 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\torderSpec\x12K\n\nframe_spec\x18\x04 \x01(\x0b\x32,.spark.connect.Expression.Window.WindowFrameR\tframeSpec\x1a\xed\x03\n\x0bWindowFrame\x12U\n\nframe_type\x18\x01 \x01(\x0e\x32\x36.spark.connect.Expression.Window.WindowFrame.FrameTypeR\tframeType\x12P\n\x05lower\x18\x02 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05lower\x12P\n\x05upper\x18\x03 \x01(\x0b\x32:.spark.connect.Expression.Window.WindowFrame.FrameBoundaryR\x05upper\x1a\x91\x01\n\rFrameBoundary\x12!\n\x0b\x63urrent_row\x18\x01 \x01(\x08H\x00R\ncurrentRow\x12\x1e\n\tunbounded\x18\x02 \x01(\x08H\x00R\tunbounded\x12\x31\n\x05value\x18\x03 
\x01(\x0b\x32\x19.spark.connect.ExpressionH\x00R\x05valueB\n\n\x08\x62oundary"O\n\tFrameType\x12\x18\n\x14\x46RAME_TYPE_UNDEFINED\x10\x00\x12\x12\n\x0e\x46RAME_TYPE_ROW\x10\x01\x12\x14\n\x10\x46RAME_TYPE_RANGE\x10\x02\x1a\xa9\x03\n\tSortOrder\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12O\n\tdirection\x18\x02 \x01(\x0e\x32\x31.spark.connect.Expression.SortOrder.SortDirectionR\tdirection\x12U\n\rnull_ordering\x18\x03 \x01(\x0e\x32\x30.spark.connect.Expression.SortOrder.NullOrderingR\x0cnullOrdering"l\n\rSortDirection\x12\x1e\n\x1aSORT_DIRECTION_UNSPECIFIED\x10\x00\x12\x1c\n\x18SORT_DIRECTION_ASCENDING\x10\x01\x12\x1d\n\x19SORT_DIRECTION_DESCENDING\x10\x02"U\n\x0cNullOrdering\x12\x1a\n\x16SORT_NULLS_UNSPECIFIED\x10\x00\x12\x14\n\x10SORT_NULLS_FIRST\x10\x01\x12\x13\n\x0fSORT_NULLS_LAST\x10\x02\x1a\x91\x01\n\x04\x43\x61st\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12-\n\x04type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04type\x12\x1b\n\x08type_str\x18\x03 \x01(\tH\x00R\x07typeStrB\x0e\n\x0c\x63\x61st_to_type\x1a\x9b\x0c\n\x07Literal\x12-\n\x04null\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x04null\x12\x18\n\x06\x62inary\x18\x02 \x01(\x0cH\x00R\x06\x62inary\x12\x1a\n\x07\x62oolean\x18\x03 \x01(\x08H\x00R\x07\x62oolean\x12\x14\n\x04\x62yte\x18\x04 \x01(\x05H\x00R\x04\x62yte\x12\x16\n\x05short\x18\x05 \x01(\x05H\x00R\x05short\x12\x1a\n\x07integer\x18\x06 \x01(\x05H\x00R\x07integer\x12\x14\n\x04long\x18\x07 \x01(\x03H\x00R\x04long\x12\x16\n\x05\x66loat\x18\n \x01(\x02H\x00R\x05\x66loat\x12\x18\n\x06\x64ouble\x18\x0b \x01(\x01H\x00R\x06\x64ouble\x12\x45\n\x07\x64\x65\x63imal\x18\x0c \x01(\x0b\x32).spark.connect.Expression.Literal.DecimalH\x00R\x07\x64\x65\x63imal\x12\x18\n\x06string\x18\r \x01(\tH\x00R\x06string\x12\x14\n\x04\x64\x61te\x18\x10 \x01(\x05H\x00R\x04\x64\x61te\x12\x1e\n\ttimestamp\x18\x11 \x01(\x03H\x00R\ttimestamp\x12%\n\rtimestamp_ntz\x18\x12 
\x01(\x03H\x00R\x0ctimestampNtz\x12\x61\n\x11\x63\x61lendar_interval\x18\x13 \x01(\x0b\x32\x32.spark.connect.Expression.Literal.CalendarIntervalH\x00R\x10\x63\x61lendarInterval\x12\x30\n\x13year_month_interval\x18\x14 \x01(\x05H\x00R\x11yearMonthInterval\x12,\n\x11\x64\x61y_time_interval\x18\x15 \x01(\x03H\x00R\x0f\x64\x61yTimeInterval\x12?\n\x05\x61rray\x18\x16 \x01(\x0b\x32\'.spark.connect.Expression.Literal.ArrayH\x00R\x05\x61rray\x12\x39\n\x03map\x18\x17 \x01(\x0b\x32%.spark.connect.Expression.Literal.MapH\x00R\x03map\x12\x42\n\x06struct\x18\x18 \x01(\x0b\x32(.spark.connect.Expression.Literal.StructH\x00R\x06struct\x1au\n\x07\x44\x65\x63imal\x12\x14\n\x05value\x18\x01 \x01(\tR\x05value\x12!\n\tprecision\x18\x02 \x01(\x05H\x00R\tprecision\x88\x01\x01\x12\x19\n\x05scale\x18\x03 \x01(\x05H\x01R\x05scale\x88\x01\x01\x42\x0c\n\n_precisionB\x08\n\x06_scale\x1a\x62\n\x10\x43\x61lendarInterval\x12\x16\n\x06months\x18\x01 \x01(\x05R\x06months\x12\x12\n\x04\x64\x61ys\x18\x02 \x01(\x05R\x04\x64\x61ys\x12"\n\x0cmicroseconds\x18\x03 \x01(\x03R\x0cmicroseconds\x1a\x82\x01\n\x05\x41rray\x12:\n\x0c\x65lement_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x0b\x65lementType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lements\x1a\xe3\x01\n\x03Map\x12\x32\n\x08key_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x07keyType\x12\x36\n\nvalue_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\tvalueType\x12\x35\n\x04keys\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x04keys\x12\x39\n\x06values\x18\x04 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1a\x81\x01\n\x06Struct\x12\x38\n\x0bstruct_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\nstructType\x12=\n\x08\x65lements\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x08\x65lementsB\x0e\n\x0cliteral_type\x1a\xba\x01\n\x13UnresolvedAttribute\x12/\n\x13unparsed_identifier\x18\x01 
\x01(\tR\x12unparsedIdentifier\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12\x31\n\x12is_metadata_column\x18\x03 \x01(\x08H\x01R\x10isMetadataColumn\x88\x01\x01\x42\n\n\x08_plan_idB\x15\n\x13_is_metadata_column\x1aX\n\x12GetColumnByOrdinal\x12\x18\n\x07ordinal\x18\x01 \x01(\x05R\x07ordinal\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\xcc\x01\n\x12UnresolvedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x1f\n\x0bis_distinct\x18\x03 \x01(\x08R\nisDistinct\x12\x37\n\x18is_user_defined_function\x18\x04 \x01(\x08R\x15isUserDefinedFunction\x1a\x32\n\x10\x45xpressionString\x12\x1e\n\nexpression\x18\x01 \x01(\tR\nexpression\x1aR\n\x0eUnresolvedStar\x12,\n\x0funparsed_target\x18\x01 \x01(\tH\x00R\x0eunparsedTarget\x88\x01\x01\x42\x12\n\x10_unparsed_target\x1aV\n\x0fUnresolvedRegex\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x42\n\n\x08_plan_id\x1a\x84\x01\n\x16UnresolvedExtractValue\x12/\n\x05\x63hild\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05\x63hild\x12\x39\n\nextraction\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\nextraction\x1a\xbb\x01\n\x0cUpdateFields\x12\x46\n\x11struct_expression\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x10structExpression\x12\x1d\n\nfield_name\x18\x02 \x01(\tR\tfieldName\x12\x44\n\x10value_expression\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x0fvalueExpression\x1ax\n\x05\x41lias\x12-\n\x04\x65xpr\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x04\x65xpr\x12\x12\n\x04name\x18\x02 \x03(\tR\x04name\x12\x1f\n\x08metadata\x18\x03 \x01(\tH\x00R\x08metadata\x88\x01\x01\x42\x0b\n\t_metadata\x1a\x9e\x01\n\x0eLambdaFunction\x12\x35\n\x08\x66unction\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08\x66unction\x12U\n\targuments\x18\x02 
\x03(\x0b\x32\x37.spark.connect.Expression.UnresolvedNamedLambdaVariableR\targuments\x1a>\n\x1dUnresolvedNamedLambdaVariable\x12\x1d\n\nname_parts\x18\x01 \x03(\tR\tnamePartsB\x0b\n\texpr_type"\xec\x02\n\x1f\x43ommonInlineUserDefinedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12\x39\n\npython_udf\x18\x04 \x01(\x0b\x32\x18.spark.connect.PythonUDFH\x00R\tpythonUdf\x12I\n\x10scalar_scala_udf\x18\x05 \x01(\x0b\x32\x1d.spark.connect.ScalarScalaUDFH\x00R\x0escalarScalaUdf\x12\x33\n\x08java_udf\x18\x06 \x01(\x0b\x32\x16.spark.connect.JavaUDFH\x00R\x07javaUdfB\n\n\x08\x66unction"\x9b\x01\n\tPythonUDF\x12\x38\n\x0boutput_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVer"\xb8\x01\n\x0eScalarScalaUDF\x12\x18\n\x07payload\x18\x01 \x01(\x0cR\x07payload\x12\x37\n\ninputTypes\x18\x02 \x03(\x0b\x32\x17.spark.connect.DataTypeR\ninputTypes\x12\x37\n\noutputType\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeR\noutputType\x12\x1a\n\x08nullable\x18\x04 \x01(\x08R\x08nullable"\x95\x01\n\x07JavaUDF\x12\x1d\n\nclass_name\x18\x01 \x01(\tR\tclassName\x12=\n\x0boutput_type\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\noutputType\x88\x01\x01\x12\x1c\n\taggregate\x18\x03 \x01(\x08R\taggregateB\x0e\n\x0c_output_type"l\n\x0c\x43\x61llFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"\\\n\x17NamedArgumentExpression\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05valueB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) 
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -45,65 +45,67 @@ b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" ) _EXPRESSION._serialized_start = 105 - _EXPRESSION._serialized_end = 5875 - _EXPRESSION_WINDOW._serialized_start = 1645 - _EXPRESSION_WINDOW._serialized_end = 2428 - _EXPRESSION_WINDOW_WINDOWFRAME._serialized_start = 1935 - _EXPRESSION_WINDOW_WINDOWFRAME._serialized_end = 2428 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_start = 2202 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_end = 2347 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_start = 2349 - _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_end = 2428 - _EXPRESSION_SORTORDER._serialized_start = 2431 - _EXPRESSION_SORTORDER._serialized_end = 2856 - _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_start = 2661 - _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_end = 2769 - _EXPRESSION_SORTORDER_NULLORDERING._serialized_start = 2771 - _EXPRESSION_SORTORDER_NULLORDERING._serialized_end = 2856 - _EXPRESSION_CAST._serialized_start = 2859 - _EXPRESSION_CAST._serialized_end = 3004 - _EXPRESSION_LITERAL._serialized_start = 3007 - _EXPRESSION_LITERAL._serialized_end = 4570 - _EXPRESSION_LITERAL_DECIMAL._serialized_start = 3842 - _EXPRESSION_LITERAL_DECIMAL._serialized_end = 3959 - _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_start = 3961 - _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_end = 4059 - _EXPRESSION_LITERAL_ARRAY._serialized_start = 4062 - _EXPRESSION_LITERAL_ARRAY._serialized_end = 4192 - _EXPRESSION_LITERAL_MAP._serialized_start = 4195 - _EXPRESSION_LITERAL_MAP._serialized_end = 4422 - _EXPRESSION_LITERAL_STRUCT._serialized_start = 4425 - _EXPRESSION_LITERAL_STRUCT._serialized_end = 4554 - _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_start = 4573 - _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_end = 4759 - _EXPRESSION_UNRESOLVEDFUNCTION._serialized_start = 4762 - _EXPRESSION_UNRESOLVEDFUNCTION._serialized_end = 4966 
- _EXPRESSION_EXPRESSIONSTRING._serialized_start = 4968 - _EXPRESSION_EXPRESSIONSTRING._serialized_end = 5018 - _EXPRESSION_UNRESOLVEDSTAR._serialized_start = 5020 - _EXPRESSION_UNRESOLVEDSTAR._serialized_end = 5102 - _EXPRESSION_UNRESOLVEDREGEX._serialized_start = 5104 - _EXPRESSION_UNRESOLVEDREGEX._serialized_end = 5190 - _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_start = 5193 - _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_end = 5325 - _EXPRESSION_UPDATEFIELDS._serialized_start = 5328 - _EXPRESSION_UPDATEFIELDS._serialized_end = 5515 - _EXPRESSION_ALIAS._serialized_start = 5517 - _EXPRESSION_ALIAS._serialized_end = 5637 - _EXPRESSION_LAMBDAFUNCTION._serialized_start = 5640 - _EXPRESSION_LAMBDAFUNCTION._serialized_end = 5798 - _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_start = 5800 - _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_end = 5862 - _COMMONINLINEUSERDEFINEDFUNCTION._serialized_start = 5878 - _COMMONINLINEUSERDEFINEDFUNCTION._serialized_end = 6242 - _PYTHONUDF._serialized_start = 6245 - _PYTHONUDF._serialized_end = 6400 - _SCALARSCALAUDF._serialized_start = 6403 - _SCALARSCALAUDF._serialized_end = 6587 - _JAVAUDF._serialized_start = 6590 - _JAVAUDF._serialized_end = 6739 - _CALLFUNCTION._serialized_start = 6741 - _CALLFUNCTION._serialized_end = 6849 - _NAMEDARGUMENTEXPRESSION._serialized_start = 6851 - _NAMEDARGUMENTEXPRESSION._serialized_end = 6943 + _EXPRESSION._serialized_end = 6064 + _EXPRESSION_WINDOW._serialized_start = 1744 + _EXPRESSION_WINDOW._serialized_end = 2527 + _EXPRESSION_WINDOW_WINDOWFRAME._serialized_start = 2034 + _EXPRESSION_WINDOW_WINDOWFRAME._serialized_end = 2527 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_start = 2301 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMEBOUNDARY._serialized_end = 2446 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_start = 2448 + _EXPRESSION_WINDOW_WINDOWFRAME_FRAMETYPE._serialized_end = 2527 + _EXPRESSION_SORTORDER._serialized_start = 2530 + 
_EXPRESSION_SORTORDER._serialized_end = 2955 + _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_start = 2760 + _EXPRESSION_SORTORDER_SORTDIRECTION._serialized_end = 2868 + _EXPRESSION_SORTORDER_NULLORDERING._serialized_start = 2870 + _EXPRESSION_SORTORDER_NULLORDERING._serialized_end = 2955 + _EXPRESSION_CAST._serialized_start = 2958 + _EXPRESSION_CAST._serialized_end = 3103 + _EXPRESSION_LITERAL._serialized_start = 3106 + _EXPRESSION_LITERAL._serialized_end = 4669 + _EXPRESSION_LITERAL_DECIMAL._serialized_start = 3941 + _EXPRESSION_LITERAL_DECIMAL._serialized_end = 4058 + _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_start = 4060 + _EXPRESSION_LITERAL_CALENDARINTERVAL._serialized_end = 4158 + _EXPRESSION_LITERAL_ARRAY._serialized_start = 4161 + _EXPRESSION_LITERAL_ARRAY._serialized_end = 4291 + _EXPRESSION_LITERAL_MAP._serialized_start = 4294 + _EXPRESSION_LITERAL_MAP._serialized_end = 4521 + _EXPRESSION_LITERAL_STRUCT._serialized_start = 4524 + _EXPRESSION_LITERAL_STRUCT._serialized_end = 4653 + _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_start = 4672 + _EXPRESSION_UNRESOLVEDATTRIBUTE._serialized_end = 4858 + _EXPRESSION_GETCOLUMNBYORDINAL._serialized_start = 4860 + _EXPRESSION_GETCOLUMNBYORDINAL._serialized_end = 4948 + _EXPRESSION_UNRESOLVEDFUNCTION._serialized_start = 4951 + _EXPRESSION_UNRESOLVEDFUNCTION._serialized_end = 5155 + _EXPRESSION_EXPRESSIONSTRING._serialized_start = 5157 + _EXPRESSION_EXPRESSIONSTRING._serialized_end = 5207 + _EXPRESSION_UNRESOLVEDSTAR._serialized_start = 5209 + _EXPRESSION_UNRESOLVEDSTAR._serialized_end = 5291 + _EXPRESSION_UNRESOLVEDREGEX._serialized_start = 5293 + _EXPRESSION_UNRESOLVEDREGEX._serialized_end = 5379 + _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_start = 5382 + _EXPRESSION_UNRESOLVEDEXTRACTVALUE._serialized_end = 5514 + _EXPRESSION_UPDATEFIELDS._serialized_start = 5517 + _EXPRESSION_UPDATEFIELDS._serialized_end = 5704 + _EXPRESSION_ALIAS._serialized_start = 5706 + _EXPRESSION_ALIAS._serialized_end = 5826 + 
_EXPRESSION_LAMBDAFUNCTION._serialized_start = 5829 + _EXPRESSION_LAMBDAFUNCTION._serialized_end = 5987 + _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_start = 5989 + _EXPRESSION_UNRESOLVEDNAMEDLAMBDAVARIABLE._serialized_end = 6051 + _COMMONINLINEUSERDEFINEDFUNCTION._serialized_start = 6067 + _COMMONINLINEUSERDEFINEDFUNCTION._serialized_end = 6431 + _PYTHONUDF._serialized_start = 6434 + _PYTHONUDF._serialized_end = 6589 + _SCALARSCALAUDF._serialized_start = 6592 + _SCALARSCALAUDF._serialized_end = 6776 + _JAVAUDF._serialized_start = 6779 + _JAVAUDF._serialized_end = 6928 + _CALLFUNCTION._serialized_start = 6930 + _CALLFUNCTION._serialized_end = 7038 + _NAMEDARGUMENTEXPRESSION._serialized_start = 7040 + _NAMEDARGUMENTEXPRESSION._serialized_end = 7132 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/expressions_pb2.pyi b/python/pyspark/sql/connect/proto/expressions_pb2.pyi index b590d22da2cc..b3b9d62d03f3 100644 --- a/python/pyspark/sql/connect/proto/expressions_pb2.pyi +++ b/python/pyspark/sql/connect/proto/expressions_pb2.pyi @@ -804,6 +804,37 @@ class Expression(google.protobuf.message.Message): self, oneof_group: typing_extensions.Literal["_plan_id", b"_plan_id"] ) -> typing_extensions.Literal["plan_id"] | None: ... + class GetColumnByOrdinal(google.protobuf.message.Message): + """An unresolved attribute that is represented by its column index.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ORDINAL_FIELD_NUMBER: builtins.int + PLAN_ID_FIELD_NUMBER: builtins.int + ordinal: builtins.int + """(Required) 0-based column index.""" + plan_id: builtins.int + """(Optional) The id of corresponding connect plan.""" + def __init__( + self, + *, + ordinal: builtins.int = ..., + plan_id: builtins.int | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal["_plan_id", b"_plan_id", "plan_id", b"plan_id"], + ) -> builtins.bool: ... 
+ def ClearField( + self, + field_name: typing_extensions.Literal[ + "_plan_id", b"_plan_id", "ordinal", b"ordinal", "plan_id", b"plan_id" + ], + ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_plan_id", b"_plan_id"] + ) -> typing_extensions.Literal["plan_id"] | None: ... + class UnresolvedFunction(google.protobuf.message.Message): """An unresolved function is not explicitly bound to one explicit function, but the function is resolved during analysis following Sparks name resolution rules. @@ -1126,6 +1157,7 @@ class Expression(google.protobuf.message.Message): COMMON_INLINE_USER_DEFINED_FUNCTION_FIELD_NUMBER: builtins.int CALL_FUNCTION_FIELD_NUMBER: builtins.int NAMED_ARGUMENT_EXPRESSION_FIELD_NUMBER: builtins.int + GET_COLUMN_BY_ORDINAL_FIELD_NUMBER: builtins.int EXTENSION_FIELD_NUMBER: builtins.int @property def literal(self) -> global___Expression.Literal: ... @@ -1164,6 +1196,8 @@ class Expression(google.protobuf.message.Message): @property def named_argument_expression(self) -> global___NamedArgumentExpression: ... @property + def get_column_by_ordinal(self) -> global___Expression.GetColumnByOrdinal: ... + @property def extension(self) -> google.protobuf.any_pb2.Any: """This field is used to mark extensions to the protocol. When plugins generate arbitrary relations they can add them here. During the planning the correct resolution is done. @@ -1189,6 +1223,7 @@ class Expression(google.protobuf.message.Message): common_inline_user_defined_function: global___CommonInlineUserDefinedFunction | None = ..., call_function: global___CallFunction | None = ..., named_argument_expression: global___NamedArgumentExpression | None = ..., + get_column_by_ordinal: global___Expression.GetColumnByOrdinal | None = ..., extension: google.protobuf.any_pb2.Any | None = ..., ) -> None: ... 
def HasField( @@ -1208,6 +1243,8 @@ class Expression(google.protobuf.message.Message): b"expression_string", "extension", b"extension", + "get_column_by_ordinal", + b"get_column_by_ordinal", "lambda_function", b"lambda_function", "literal", @@ -1251,6 +1288,8 @@ class Expression(google.protobuf.message.Message): b"expression_string", "extension", b"extension", + "get_column_by_ordinal", + b"get_column_by_ordinal", "lambda_function", b"lambda_function", "literal", @@ -1297,6 +1336,7 @@ class Expression(google.protobuf.message.Message): "common_inline_user_defined_function", "call_function", "named_argument_expression", + "get_column_by_ordinal", "extension", ] | None: ... diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 74da285ff1ee..6b1a6df1618d 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -3455,7 +3455,10 @@ def __getitem__(self, item: Union[int, str, Column, List, Tuple]) -> Union[Colum elif isinstance(item, (list, tuple)): return self.select(*item) elif isinstance(item, int): - jc = self._jdf.apply(self.columns[item]) + n = len(self.columns) + # 1, convert bool; 2, covert negative index; 3, validate index + item = range(0, n)[int(item)] + jc = self._jdf.apply(item) return Column(jc) else: raise PySparkTypeError( diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 868d3f6f0aad..8aa6535e02b0 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -26,7 +26,6 @@ import io from contextlib import redirect_stdout -from pyspark import StorageLevel from pyspark.sql import SparkSession, Row, functions from pyspark.sql.functions import col, lit, count, sum, mean, struct from pyspark.sql.pandas.utils import pyarrow_version_less_than_minimum @@ -63,6 +62,51 @@ class DataFrameTestsMixin: + def test_getitem_invalid_indices(self): + df = self.spark.sql( + "SELECT * FROM VALUES " + "(1, 1.1, 'a'), 
" + "(2, 2.2, 'b'), " + "(4, 4.4, 'c') " + "AS TAB(a, b, c)" + ) + + # accepted type and values + for index in [False, True, 0, 1, 2, -1, -2, -3]: + df[index] + + # negative cases: ordinal out of range + for index in [-10, -4, 3, 10, 100]: + with self.assertRaises(IndexError): + df[index] + + # negative cases: unsupported types + for index in [None, 1.0, Decimal(1)]: + with self.assertRaises(PySparkTypeError): + df[index] + + def test_getitem_duplicated_column(self): + df = self.spark.sql( + "SELECT * FROM VALUES " + "(1, 1.1, 'a'), " + "(2, 2.2, 'b'), " + "(4, 4.4, 'c') " + "AS TAB(a, a, a)" + ) + + self.assertEqual( + df.select(df[0]).schema.simpleString(), + "struct", + ) + self.assertEqual( + df.select(df[1]).schema.simpleString(), + "struct", + ) + self.assertEqual( + df.select(df[2]).schema.simpleString(), + "struct", + ) + def test_range(self): self.assertEqual(self.spark.range(1, 1).count(), 0) self.assertEqual(self.spark.range(1, 0, -1).count(), 1) @@ -77,7 +121,6 @@ def test_duplicated_column_names(self): self.assertEqual(2, row[1]) self.assertEqual("Row(c=1, c=2)", str(row)) # Cannot access columns - self.assertRaises(AnalysisException, lambda: df.select(df[0]).first()) self.assertRaises(AnalysisException, lambda: df.select(df.c).first()) self.assertRaises(AnalysisException, lambda: df.select(df["c"]).first()) diff --git a/python/pyspark/sql/tests/test_group.py b/python/pyspark/sql/tests/test_group.py index 6981601cb129..85d253577414 100644 --- a/python/pyspark/sql/tests/test_group.py +++ b/python/pyspark/sql/tests/test_group.py @@ -149,6 +149,33 @@ def test_order_by_ordinal(self): with self.assertRaises(IndexError): df.orderBy(-3) + def test_order_by_ordinal_duplicated_column(self): + spark = self.spark + df = spark.createDataFrame( + [ + (1, 1), + (1, 2), + (2, 1), + (2, 2), + (3, 1), + (3, 2), + ], + ["a", "a"], + ) + + with self.tempView("v"): + df.createOrReplaceTempView("v") + + df1 = spark.sql("select * from v order by 2, 1;") + df2 = 
df.orderBy(2, 1) + assertSchemaEqual(df1.schema, df2.schema) + assertDataFrameEqual(df1, df2) + + df1 = spark.sql("select * from v order by 1 desc, 2;") + df2 = df.orderBy(-1, 2) + assertSchemaEqual(df1.schema, df2.schema) + assertDataFrameEqual(df1, df2) + class GroupTests(GroupTestsMixin, ReusedSQLTestCase): pass diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 2d9e0e231d7c..528904bb29a1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1393,6 +1393,11 @@ class Dataset[T] private[sql]( */ def apply(colName: String): Column = col(colName) + /** + * Selects column based on the column index (0-based) and returns it as a [[Column]]. + */ + private[sql] def apply(index: Int): Column = col(index) + /** * Specifies some hint on the current Dataset. As an example, the following code specifies * that one of the plan can be broadcasted: @@ -1445,6 +1450,13 @@ class Dataset[T] private[sql]( } } + /** + * Selects column based on the column index (0-based) and returns it as a [[Column]]. + */ + private[sql] def col(index: Int): Column = { + Column(addDataFrameIdToCol(queryExecution.analyzed.output(index))) + } + /** * Selects a metadata column based on its logical column name, and returns it as a [[Column]]. *