From a2b6be92037a30b89294534c82b2c8ce03eda1d0 Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Thu, 9 Sep 2021 14:57:04 -0700 Subject: [PATCH 1/2] Add type variations --- extensions/type_variations.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 extensions/type_variations.yaml diff --git a/extensions/type_variations.yaml b/extensions/type_variations.yaml new file mode 100644 index 000000000..cf362eaa2 --- /dev/null +++ b/extensions/type_variations.yaml @@ -0,0 +1,13 @@ +types: +- string: + - name: dict + description: a dictionary encoded string + - name: bigoffset + description: The arrow large string representation of strings, still restricted to the default string size defined in Substrait. +- struct: + - name: avro + description: an avro encoded struct + - name: cstruct + description: a cstruct representation of the struct + - name: dict + description: a arrow utf8 value using From 7d32ae634d03e1ba6a4b833387c2f4decccf7788 Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Thu, 9 Sep 2021 15:06:09 -0700 Subject: [PATCH 2/2] Remove physical types. --- binary/type.proto | 210 ++---------------------------- extensions/type_variations.yaml | 13 -- site/docs/spec/specification.md | 5 +- site/docs/types/_config | 1 - site/docs/types/physical_types.md | 37 ------ 5 files changed, 15 insertions(+), 251 deletions(-) delete mode 100644 extensions/type_variations.yaml delete mode 100644 site/docs/types/physical_types.md diff --git a/binary/type.proto b/binary/type.proto index a8052fe29..242185e33 100644 --- a/binary/type.proto +++ b/binary/type.proto @@ -29,243 +29,72 @@ message Type { UserDefined user_defined = 31; } - message I8 { - PhysicalType physical_type = 1; + message I8 {} - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } + message I16 {} - message U8 { - PhysicalType physical_type = 1; + message I32 {} - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } + message I64 {} - message I16 { - PhysicalType physical_type = 1; + message FP32 {} - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } + message FP64 {} - message U16 { - PhysicalType physical_type = 1; + message String {} - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } + message Binary {} - message I32 { - PhysicalType physical_type = 1; + message Timestamp {} - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } + message Date {} - message U32 { - PhysicalType physical_type = 1; + message Time {} - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message I64 { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } + message IntervalYear {} - message U64 { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message FP16 { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message FP32 { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message FP64 { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message String { - PhysicalType physical_type = 1; - bool dictionary_encoded = 2; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - ARROW_LARGE_STRING = 1; - } - } - - message Binary { - PhysicalType physical_type = 1; - bool dictionary_encoded = 2; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - ARROW_LARGE_BINARY = 1; - } - } - - message Timestamp { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message Date { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message Time { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message IntervalYear { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } - } - - message IntervalDay { - PhysicalType physical_type = 1; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - ARROW_MONTH_DAY_NANO = 1; - } - } + message IntervalDay {} // Start compound types. message FixedChar { int32 length = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message VarChar { int32 length = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message FixedBinary { int32 length = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message Decimal { int32 scale = 1; int32 precision = 2; - PhysicalType physical_type = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - ARROW_128 = 1; - } } message Struct { repeated Type types = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message NamedStruct { repeated Pair pairs = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; message Pair { string name = 1; Type type = 2; } - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message List { Type type = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message Map { repeated KeyValue key_values = 1; - PhysicalType physical_type = 2; - bool dictionary_encoded = 3; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - UTF8_ORDERED_KEYS = 1; - } message KeyValue { Type key = 1; @@ -275,24 +104,11 @@ message Type { message TimestampMicroTZ { string timezone = 1; - - PhysicalType physical_type = 2; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } message TimestampTZ { string timezone = 1; - - PhysicalType physical_type = 2; - - enum PhysicalType { - SYSTEM_DEFAULT = 0; - } } - message UserDefined { int32 organization = 1; string name = 2; diff --git a/extensions/type_variations.yaml b/extensions/type_variations.yaml deleted file mode 100644 index cf362eaa2..000000000 --- a/extensions/type_variations.yaml +++ /dev/null @@ -1,13 +0,0 @@ -types: -- string: - - name: dict - description: a dictionary encoded string - - name: bigoffset - description: The arrow large string representation of strings, still restricted to the default string size defined in Substrait. -- struct: - - name: avro - description: an avro encoded struct - - name: cstruct - description: a cstruct representation of the struct - - name: dict - description: a arrow utf8 value using diff --git a/site/docs/spec/specification.md b/site/docs/spec/specification.md index 9945b891e..846a67dfc 100644 --- a/site/docs/spec/specification.md +++ b/site/docs/spec/specification.md @@ -19,9 +19,8 @@ Once all portions of the specification have been moved to commit (or eliminated) | Priority | Status | Section | Description | | -------- | ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| 1 | sketch | [Simple Logical Types](/types/simple_logical_types) | A way to describe the set of basic types that will be operated on within a plan. Only includes simple types such as integers and doubles (nothing configurable or compound). | -| | sketch | [Compound Logical Types](/types/compound_logical_types) | Expression of types that go beyond simple scalar values. Key concepts here include: configurable types such as fixed length and numeric types as well as compound types such as structs, maps, lists, etc. | -| | sketch | [Physical Types](/types/physical_types) | Physical extensions to logical types. | +| 1 | sketch | [Simple Types](/types/simple_logical_types) | A way to describe the set of basic types that will be operated on within a plan. Only includes simple types such as integers and doubles (nothing compound). | +| | sketch | [Compound Types](/types/compound_logical_types) | Expression of types that go beyond simple scalar values. Key concepts here include: configurable types such as fixed length and numeric types as well as compound types such as structs, maps, lists, etc. | | | sketch | [User Defined Types](/types/user_defined_types) | Extensions that can be defined for specific IR producers/consumers. | | 2 | sketch | [Field References](/expressions/field_references) | Expressions to identify which portions of a record should be | | 3 | sketch | [Scalar Functions](/expressions/scalar_functions) | Description of how functions are specified. Concepts include arguments, variadic functions, output type derivation, etc. | diff --git a/site/docs/types/_config b/site/docs/types/_config index 70784218f..acbdad2f8 100644 --- a/site/docs/types/_config +++ b/site/docs/types/_config @@ -1,5 +1,4 @@ arrange: - simple_logical_types.md - compound_logical_types.md - - physical_types.md - user_defined_types.md \ No newline at end of file diff --git a/site/docs/types/physical_types.md b/site/docs/types/physical_types.md deleted file mode 100644 index aad6339b3..000000000 --- a/site/docs/types/physical_types.md +++ /dev/null @@ -1,37 +0,0 @@ -# Physical Types - -Since Substrait is designed to work in both logical and physical contexts, there is need to support extended attributes in the physical context. - -For each logical type, we declare one or more physical representations of that logical type as approrpriate to the system specializations. Additionally, we describe whether a particular type is dictionary encoded. Each of these representation details is also used when specifiying a function signature to determine which of the specific physical representations of data are supported by a paticular function signature. - -In many cases, a system will only have a single physical representation of each type. In those cases, it is expected that the binding of an operation is associated with the system default representation of the data. While a physical types are defined as discrete from logical types within the specification, the serialization formats will typically collapse these into a singular concept. - -| Logical Type | Physical Representations | Support Dictionary Encoding | -| ------------- | ----------------------------------------------------------- | --------------------------- | -| boolean | 0=System default | no | -| i8 | 0=System default | no | -| i16 | 0=System default | no | -| i32 | 0=System default | no | -| i64 | 0=System default | no | -| fp32 | 0=System default | no | -| fp64 | 0=System default | no | -| string | 0=System default, 1=Arrow Large String | yes | -| binary | 0=System default, Arrow Large Binary | yes | -| timestamp | 0=System default | no | -| date | 0=System default | no | -| time | 0=System default | no | -| interval_year | 0=System default | no | -| interval_day | 0=System default, 1=Arrow MONTH_DAY_NANO | no | -| fixedchar | 0=System default | yes | -| varchar | 0=System default | yes | -| fixedbinary | 0=System default | yes | -| decimal | 0=System default, 1=Arrow 128 Bit Width | no | -| struct | 0=System default | yes | -| list | 0=System default, 1=Arrow Large List | yes | -| map | 0=System default, 1=Map where keys are utf8 ordered strings | yes | -| timestamp_tz | 0=System default | no | - - - - -