From 1ca0bfb226d2bd1c66e232217d89e3cc2f351e62 Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Sat, 2 Oct 2021 18:42:01 -0700 Subject: [PATCH] Updates to ideally support majority of tpch queries - Remove aggregate expressions type from generalized expressions. (only allow aggregate expressions as root expressions for aggregation) - Update function mapping to support options - Remove named structs from type unions (should only be used in special places as root, not in arbitrary hierarchy) - Add project, join, fetch, aggregate, sort, set logical relational operations. - Introduce key scalar and aggregate functions in functions yaml. - Remove old extensions docs Address #42, #43, #44 --- binary/expression.proto | 15 +- binary/extensions.proto | 5 + binary/parameterized_types.proto | 1 - binary/relations.proto | 78 +++++- binary/type.proto | 1 - binary/type_expressions.proto | 1 - extensions/aggregate_functions.yaml | 74 ++++-- extensions/organizations.yaml | 4 - extensions/scalar_functions.yaml | 362 ++++++++++++++++++++++++++-- extensions/toc.yaml | 9 - extensions/window_functions.yaml | 18 -- 11 files changed, 500 insertions(+), 68 deletions(-) delete mode 100644 extensions/organizations.yaml delete mode 100644 extensions/toc.yaml delete mode 100644 extensions/window_functions.yaml diff --git a/binary/expression.proto b/binary/expression.proto index 00321f377..156acafbb 100644 --- a/binary/expression.proto +++ b/binary/expression.proto @@ -13,7 +13,6 @@ message Expression { Literal literal = 1; FieldReference selection = 2; ScalarFunction scalar_function = 3; - AggregateFunction aggregate_function = 4; WindowFunction window_function = 5; IfThen if_then = 6; SwitchExpression switch_expression = 7; @@ -75,20 +74,34 @@ message Expression { message ScalarFunction { Extensions.FunctionId id = 1; repeated Expression args = 2; + Type output_type = 3; } message AggregateFunction { Extensions.FunctionId id = 1; repeated Expression args = 2; repeated SortField sorts = 3; + 
AggregationPhase phase = 4; + Type output_type = 5; } + enum AggregationPhase { + UNKNOWN = 0; + INITIAL_TO_INTERMEDIATE = 1; + INTERMEDIATE_TO_INTERMEDIATE = 2; + INITIAL_TO_RESULT = 3; + INTERMEDIATE_TO_RESULT = 4; + } + + message WindowFunction { Extensions.FunctionId id = 1; repeated Expression partitions = 2; repeated SortField sorts = 3; Bound upper_bound = 4; Bound lower_bound = 5; + AggregationPhase phase = 6; + Type output_type = 7; message Bound { diff --git a/binary/extensions.proto b/binary/extensions.proto index 54e4b512a..8c5793c04 100644 --- a/binary/extensions.proto +++ b/binary/extensions.proto @@ -47,6 +47,11 @@ message Extensions { ExtensionId extension_id = 2; string name = 3; uint32 index = 4; + repeated Option options = 5; + message Option { + string key = 1; + string value = 2; + } } } diff --git a/binary/parameterized_types.proto b/binary/parameterized_types.proto index fd4f57c32..ffcd9f933 100644 --- a/binary/parameterized_types.proto +++ b/binary/parameterized_types.proto @@ -32,7 +32,6 @@ message ParameterizedType { ParameterizedDecimal decimal = 24; ParameterizedStruct struct = 25; - ParameterizedNamedStruct named_struct = 26; ParameterizedList list = 27; ParameterizedMap map = 28; diff --git a/binary/relations.proto b/binary/relations.proto index 4ce2f5b1d..4ba081842 100644 --- a/binary/relations.proto +++ b/binary/relations.proto @@ -51,9 +51,14 @@ message ReadRel { Expression filter = 3; MaskExpression projection = 4; - oneof read_type { VirtualTable virtual_table = 5; + LocalFiles local_files = 6; + NamedTable named_table = 7; + } + + message NamedTable { + repeated string names = 1; } @@ -83,6 +88,56 @@ message ReadRel { } +message ProjectRel { + RelCommon common = 1; + Rel input = 2; + repeated Expression expressions = 3; +} + +message JoinRel { + RelCommon common = 1; + Rel left = 2; + Rel right = 3; + Expression expression = 4; + Expression post_join_filter = 5; + + enum JoinType { + UNKNOWN = 0; + INNER = 1; + OUTER = 2; + LEFT 
= 3; + RIGHT = 4; + } +} + +message FetchRel { + RelCommon common = 1; + Rel input = 2; + int64 offset = 3; + int64 count = 4; +} + +message AggregateRel { + RelCommon common = 1; + Rel input = 2; + repeated Grouping groupings = 3; + repeated Measure measures = 4; + Expression.AggregationPhase phase = 5; + + message Grouping { + repeated int32 input_fields = 1; + } + + message Measure { + Expression.AggregateFunction measure = 1; + } +} + +message SortRel { + RelCommon common = 1; + Rel input = 2; + repeated Expression.SortField sorts = 3; +} message FilterRel { RelCommon common = 1; @@ -90,10 +145,31 @@ message FilterRel { Expression condition = 3; } +message SetRel { + RelCommon common = 1; + repeated Rel inputs = 2; + SetOp op = 3; + + enum SetOp { + UNKNOWN = 0; + MINUS_PRIMARY = 1; + MINUS_MULTISET = 2; + INTERSECTION_PRIMARY = 3; + INTERSECTION_MULTISET = 4; + UNION_DISTINCT = 5; + UNION_ALL = 6; + } +} message Rel { oneof RelType { ReadRel read = 1; FilterRel filter = 2; + FetchRel fetch = 3; + AggregateRel aggregate = 4; + SortRel sort = 5; + JoinRel join = 6; + ProjectRel project = 7; + SetRel set = 8; } } \ No newline at end of file diff --git a/binary/type.proto b/binary/type.proto index 0a0056335..62c8e7f64 100644 --- a/binary/type.proto +++ b/binary/type.proto @@ -31,7 +31,6 @@ message Type { Decimal decimal = 24; Struct struct = 25; - NamedStruct named_struct = 26; List list = 27; Map map = 28; diff --git a/binary/type_expressions.proto b/binary/type_expressions.proto index 6dea0c46e..87fe5d762 100644 --- a/binary/type_expressions.proto +++ b/binary/type_expressions.proto @@ -32,7 +32,6 @@ message DerivationExpression { ExpressionDecimal decimal = 24; ExpressionStruct struct = 25; - ExpressionNamedStruct named_struct = 26; ExpressionList list = 27; ExpressionMap map = 28; diff --git a/extensions/aggregate_functions.yaml b/extensions/aggregate_functions.yaml index 2bdfff92d..356d4465f 100644 --- a/extensions/aggregate_functions.yaml +++ 
b/extensions/aggregate_functions.yaml @@ -1,18 +1,58 @@ functions: -- name: add - id: 0 - arguments: - - type: i32 - variadic: 2..N - return: i32 -- name: lt - id: 1 - arguments: - - type: i32 - - type: i32 - return: boolean -- name: and - id: 3 - arguments: - type: boolean - variadic: 2..N - return: boolean + - name: 'sum' + description: "Sum a set of values." + options: + overflow: [SILENT, SATURATE, ERROR] + variants: + - name: scalar + decomposable: MANY + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + intermediate: K + return: K + - name: decimal + decomposable: MANY + parameters: + integer: + - P: "1..38" + - S: "1..38" + arguments: + - type: "DECIMAL" + intermediate: "DECIMAL<38,S>" + return: "DECIMAL<38,S>" + - name: 'avg' + description: "Average a set of values." + options: + overflow: [SILENT, SATURATE, ERROR] + variants: + - name: scalar + decomposable: MANY + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + intermediate: "STRUCT" + return: K + - name: decimal + decomposable: MANY + parameters: + integer: + - P: "1..38" + - S: "1..38" + arguments: + - type: "DECIMAL" + intermediate: "STRUCT,count:i64>" + return: "DECIMAL<38,S>" + - name: 'count' + description: "Count a set of values." 
+ options: + overflow: [SILENT, SATURATE, ERROR] + decomposable: MANY + parameters: + - K + arguments: + - type: K + intermediate: i64 + return: i64 diff --git a/extensions/organizations.yaml b/extensions/organizations.yaml deleted file mode 100644 index 1f85194c0..000000000 --- a/extensions/organizations.yaml +++ /dev/null @@ -1,4 +0,0 @@ -organizations: -- name: Substrait - id: 0 - extensions: "github://substrait-io/spec/extensions/toc.yaml" diff --git a/extensions/scalar_functions.yaml b/extensions/scalar_functions.yaml index 2bdfff92d..53601868e 100644 --- a/extensions/scalar_functions.yaml +++ b/extensions/scalar_functions.yaml @@ -1,18 +1,350 @@ -functions: -- name: add - id: 0 +- name: '+' + description: "Add two numeric values." + options: + overflow: [SILENT, SATURATE, ERROR] + variants: + - variant: scalar + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + - type: K + return: K + - variant: decimal + parameters: + integer: + - P1: "1..38" + - S1: "1..38" + - P2: "1..38" + - S2: "1..38" + arguments: + - type: "DECIMAL" + - type: "DECIMAL" + return: |- + init_scale = max(S1,S2) + init_prec = init_scale + max(P1 - S1, P2-S2) + 1 + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec,38) + scale = init_prec > 38 ? scale - init_prec + 38 : min_scale + DECIMAL + - variant: date/time plus interval year + description: Add an interval to a date/time type. + parameters: + - T: [timestamp, timestamp_tz, date] + - I: [interval_year, interval_day] + arguments: + - type: T + - type: I + return: T +- name: '-' + description: "Subtract one operand from another." 
+ options: + overflow: [SILENT, SATURATE, ERROR] + variants: + - variant: scalar + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + - type: K + return: K + - variant: decimal + parameters: + integer: + - P1: "1..38" + - S1: "1..38" + - P2: "1..38" + - S2: "1..38" + arguments: + - type: "DECIMAL" + - type: "DECIMAL" + return: |- + init_scale = max(S1,S2) + init_prec = init_scale + max(P1 - S1, P2-S2) + 1 + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec,38) + scale = init_prec > 38 ? scale - init_prec + 38 : min_scale + DECIMAL +- name: '*' + description: Multiply two numeric values. + options: + overflow: [SILENT, SATURATE, ERROR] + variants: + - variant: scalar + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + - type: K + return: K + - variant: decimal + parameters: + integer: + - P1: "1..38" + - S1: "1..38" + - P2: "1..38" + - S2: "1..38" + arguments: + - type: "DECIMAL" + - type: "DECIMAL" + return: |- + init_scale = S1 + S2 + init_prec = P1 + P2 + 1 + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec,38) + scale = init_prec > 38 ? scale - init_prec + 38 : min_scale + DECIMAL +- name: '/' + description: "Divide two numeric values." + options: + overflow: [SILENT, SATURATE, ERROR] + variants: + - variant: divide + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + - type: K + return: K + - variant: decimal + parameters: + integer: + - P1: "1..38" + - S1: "1..38" + - P2: "1..38" + - S2: "1..38" + arguments: + - type: "DECIMAL" + - type: "DECIMAL" + return: |- + init_scale = max(6, S1 + P2 + 1) + init_prec = P1 - S1 + P2 + init_scale + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec,38) + scale = init_prec > 38 ? scale - init_prec + 38 : min_scale + DECIMAL +- name: ['mod','%'] + description: "Modulus of two values." 
+ variants: + - variant: divide + parameters: + - K: [i8,i16,i32,i64,fp32,fp64] + arguments: + - type: K + - type: K + return: K + - variant: decimal + parameters: + integer: + - P1: "1..38" + - S1: "1..38" + - P2: "1..38" + - S2: "1..38" + arguments: + - type: "DECIMAL" + - type: "DECIMAL" + return: |- + init_scale = max(S1,S2) + init_prec = min(P1 - S1, P2 - S2) + init_scale + min_scale = min(init_scale, 6) + delta = init_prec - 38 + prec = min(init_prec,38) + scale = init_prec > 38 ? scale - init_prec + 38 : min_scale + DECIMAL +- name: like + description: Determine whether a string matches a particular pattern using SQL standard like syntax. + options: + case: [SENSITIVE, INSENSITIVE] + parameters: + - S1: ["VARCHAR", "FIXEDCHAR", "STRING"] + - S2: ["VARCHAR", "FIXEDCHAR", "STRING"] arguments: - - type: i32 - variadic: 2..N - return: i32 -- name: lt - id: 1 - arguments: - - type: i32 - - type: i32 + - type: S1 + name: value to test + - type: S2 + name: pattern + constant: true return: boolean -- name: and - id: 3 - arguments: - type: boolean - variadic: 2..N +- template: "comparisons" + variants: + - variant: simple + parameters: + - K: [boolean,i8,i16,i32,i64,fp32,fp64,timestamp,timestamp_tz,date,time] + arguments: + - type: K + - type: K + return: boolean + - variant: string comparisons + parameters: + - S1: ["VARCHAR", "FIXEDCHAR", "STRING"] + - S2: ["VARCHAR", "FIXEDCHAR", "STRING"] + arguments: + - type: S1 + - type: S2 + return: boolean + - variant: binary comparisons + parameters: + - S1: ["BINARY", "FIXEDBINARY"] + - S1: ["BINARY", "FIXEDBINARY"] + arguments: + - type: S1 + - type: S2 + return: boolean +- name: "=" + extends: comparisons + description: Whether two values equal each other. 
+ variants: + - variant: equality_only_types + parameters: + - K: [uuid, interval_year, interval_day] + arguments: + - type: K + - type: K + return: boolean +- name: "is_not_distinct_from" + extends: comparisons + description: Whether two values equal each other (nulls are considered equal). + variants: + - variant: equality_only_types + parameters: + - K: [uuid, interval_year, interval_day] + arguments: + - type: K + - type: K + return: boolean +- name: ["!=", "<>"] + extends: comparisons + description: Whether two values are not equal to each other. +- name: "<" + extends: comparisons + description: Whether the first value is less than the second value. +- name: "<=" + extends: comparisons + description: Whether the first value is less than or equal to the second value. +- name: ">" + extends: comparisons + description: Whether the first value is greater than the second value. +- name: ">=" + extends: comparisons + description: Whether the first value is greater than or equal to the second value. +- name: compare + description: |- + Whether the first value is greater than, less than or equal to the second value. Returns -1 if first + value is less than second, 0 if the two values are equal or 1 if the first value is greater than the second. + options: + nulls: [FIRST, LAST] + variants: + - variant: simple + parameters: + - K: [boolean,i8,i16,i32,i64,fp32,fp64,timestamp,timestamp_tz,date,time,interval_year] + arguments: + - type: K + - type: K + return: i8 + - variant: string comparisons + parameters: + - S1: ["VARCHAR", "FIXEDCHAR", "STRING"] + - S2: ["VARCHAR", "FIXEDCHAR", "STRING"] + arguments: + - type: S1 + - type: S2 + return: i8 +- name: cast + description: Convert value from one type to another. + variants: + - variant: string + description: Convert value to string. 
+ parameters: + - V: [boolean, i8, i16, i32, i64, fp32, fp64, timestamp_tz, date, time, interval_year, interval_day] + - S: [VARCHAR(N), FIXEDCHAR(N), STRING] + arguments: + - type: S + kind: TYPE + name: Target type. + - type: V + name: Value to cast. +- name: extract + description: Extract portion of a date/time value. + parameters: + - D: [timestamp, timestamp_tz, date, time] + arguments: + - type: D + name: Date/time value to extract information from. + - type: STRING + name: The part of the value to extract. + constant: TRUE + return: i64 +- template: boolean + arugments: + - type: boolean + - type: boolean return: boolean +- name: not + description: The boolean not of a provided value + arugments: + - type: boolean + return: boolean +- name: and + extends: boolean + description: The boolean and of two values. +- name: or + extends: boolean + description: The boolean or of two values. +- name: xor + extends: boolean + description: The boolean xor of two values. +- name: concat + variants: + - variant: fixed-width strings + arguments: + - type: FIXEDCHAR + - type: FIXEDCHAR + return: FIXEDCHAR + - variant: variable defined-length strings + parameters: + - S1: [FIXEDCHAR, VARCHAR] + - S2: [FIXEDCHAR, VARCHAR] + arguments: + - type: S1 + - type: S2 + return VARCHAR + - variant: arbitrary length values + parameters: + - S1: ["VARCHAR", "FIXEDCHAR", "STRING"] + - S2: ["VARCHAR", "FIXEDCHAR", "STRING"] + arguments: + - type: S1 + - type: S2 + return: string + - variant: fixed-width binary + arguments: + - type: FIXEDBINARY + - type: FIXEDBINARY + return: FIXEDBINARY + - variant: binary + parameters: + - S1: [ fixedbinary, binary ] + - S2: [ fixedbinary, binary ] + arguments: + - type: S1 + - type: S2 + return: binary +- name: substring + description: Get a sub portion of the given string. + parameters: + - S: ["VARCHAR", "FIXEDCHAR", "STRING"] + arguments: + - type: S + name: String to extract from. + - type: i32 + name: Start position (inclusive, zero index). 
Negative starts from end of string. + - type: i32 + name: End position (exclusive, zero index). Negative starts from end of string. + + + + diff --git a/extensions/toc.yaml b/extensions/toc.yaml deleted file mode 100644 index 25881cdcb..000000000 --- a/extensions/toc.yaml +++ /dev/null @@ -1,9 +0,0 @@ -organization: 0 -license: aplv2 -name: Substrait -extensions: - functions: - scalar: [https://github.com/substrait-io/spec/blob/main/extensions/scalar_functions.yaml] - aggregate: [https://github.com/substrait-io/spec/blob/main/extensions/aggregate_functions.yaml] - window: [https://github.com/substrait-io/spec/blob/main/extensions/window_functions.yaml] - types: [https://github.com/substrait-io/spec/blob/main/extensions/extension_types.yaml] diff --git a/extensions/window_functions.yaml b/extensions/window_functions.yaml deleted file mode 100644 index 2bdfff92d..000000000 --- a/extensions/window_functions.yaml +++ /dev/null @@ -1,18 +0,0 @@ -functions: -- name: add - id: 0 - arguments: - - type: i32 - variadic: 2..N - return: i32 -- name: lt - id: 1 - arguments: - - type: i32 - - type: i32 - return: boolean -- name: and - id: 3 - arguments: - type: boolean - variadic: 2..N - return: boolean