From 6e0235a1767ed2fa9577ff6b1135fe0f9fdfa369 Mon Sep 17 00:00:00 2001 From: angerszhu Date: Mon, 4 Jan 2021 18:56:13 +0800 Subject: [PATCH 01/25] [SPARK-33976][SQL] Spark script TRANSFORM related change doc --- docs/_data/menu-sql.yaml | 2 + docs/sql-ref-syntax-qry-select-transform.md | 165 ++++++++++++++++++++ docs/sql-ref-syntax-qry-select.md | 8 +- docs/sql-ref-syntax-qry.md | 1 + docs/sql-ref-syntax.md | 1 + 5 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 docs/sql-ref-syntax-qry-select-transform.md diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index cda2a1a5139a..1123521087be 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -188,6 +188,8 @@ url: sql-ref-syntax-qry-select-lateral-view.html - text: PIVOT Clause url: sql-ref-syntax-qry-select-pivot.html + - text: TRANSFORM Clause + url: sql-ref-syntax-qry-select-transform.html - text: EXPLAIN url: sql-ref-syntax-qry-explain.html - text: Auxiliary Statements diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md new file mode 100644 index 000000000000..1928563b2f15 --- /dev/null +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -0,0 +1,165 @@ +--- +layout: global +title: TRANSFORM +displayTitle: TRANSFORM +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--- + +### Description + +The `TRANSFORM` clause is used to specifies a hive-style transform (SELECT TRANSFORM/MAP/REDUCE) +query specification to transform the input by forking and running the specified script. Users can +plug in their own custom mappers and reducers in the data stream by using features natively supported +in the Spark/Hive language. e.g. in order to run a custom mapper script - map_script - and a custom +reducer script - reduce_script - the user can issue the following command which uses the TRANSFORM +clause to embed the mapper and the reducer scripts. + +Currently, Spark's script transform support two mode: + + 1. Without Hive: It means we run Spark SQL without hive support, in this mode, we can use default format + by treating data as STRING and use Spark's own SerDe. + 2. WIth Hive: It means we run Spark SQL with Hive support, in this mode, when we use default format, + it will be treated as Hive default fomat. And we can use Hive supported SerDe to process data. + +In both mode with default format, columns will be transformed to STRING and delimited by TAB before feeding +to the user script, Similarly, all NULL values will be converted to the literal string \N in order to +differentiate NULL values from empty strings. The standard output of the user script will be treated as +TAB-separated STRING columns, any cell containing only \N will be re-interpreted as a NULL, and then the +resulting STRING column will be cast to the data type specified in the table declaration in the usual way. +User scripts can output debug information to standard error which will be shown on the task detail page on hadoop. +These defaults can be overridden with `ROW FORMAT DELIMITED`. 
+ +### Syntax + +```sql +rowFormat + : ROW FORMAT SERDE serde_name [ WITH SERDEPROPERTIES serde_props ] + | ROW FORMAT DELIMITED + [ FIELDS TERMINATED BY fieldsTerminatedBy [ ESCAPED BY escapedBy ] ] + [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] + [ MAP KEYS TERMINATED BY keysTerminatedBy ] + [ LINES TERMINATED BY linesSeparatedBy ] + [ NULL DEFINED AS nullDefinedAs ] + +inRowFormat=rowFormat +outRowFormat=rowFormat +namedExpressionSeq = named_expression [ , ... ] + +transformClause: + SELECT [ TRANSFORM ( namedExpressionSeq ) | MAP namedExpressionSeq | REDUCE namedExpressionSeq ] + [ inRowFormat ] + [ RECORDWRITER recordWriter ] + USING script + [ AS ( [ identifierSeq | colTypeList ] [ , ... ] ) ] + [ outRowFormat ] + [ RECORDREADER recordReader ] + [ WHERE boolean_expression ] + [ GROUP BY expression [ , ... ] ] + [ HAVING boolean_expression ] +``` + +### Parameters + +* **named_expression** + + xxx. + +* **serde_name** + + xxx. + +* **serde_props** + + xxx. + +* **fieldsTerminatedBy** + + xxx. + +* **escapedBy** + + xxx. + +* **collectionItemsTerminatedBy** + + xxx. + +* **keysTerminatedBy** + + xxx. + +* **linesSeparatedBy** + + xxx. + +* **nullDefinedAs** + + xxx. + +* **rowFormat** + + xxx. + +* **recordWriter** + + xxx. + +* **recordReader** + + xxx. + +* **identifierSeq** + + xxx. + +* **recordReader** + + xxx. + +### Without Hive support Mode + +### With Hive Support Mode + +### Schema-less Script Transforms + +If there is no AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: + + 1. key: which is before the first tab, + 2. value: which is the rest after the first tab. + +Note that this is different from specifying AS key, value because in that case, value will only contain the portion +between the first tab and the second tab if there are multiple tabs. 
+ +### Examples + +```sql + +``` + +### Related Statements + +* [SELECT Main](sql-ref-syntax-qry-select.html) +* [WHERE Clause](sql-ref-syntax-qry-select-where.html) +* [GROUP BY Clause](sql-ref-syntax-qry-select-groupby.html) +* [HAVING Clause](sql-ref-syntax-qry-select-having.html) +* [ORDER BY Clause](sql-ref-syntax-qry-select-orderby.html) +* [SORT BY Clause](sql-ref-syntax-qry-select-sortby.html) +* [DISTRIBUTE BY Clause](sql-ref-syntax-qry-select-distribute-by.html) +* [LIMIT Clause](sql-ref-syntax-qry-select-limit.html) +* [CASE Clause](sql-ref-syntax-qry-select-case.html) +* [PIVOT Clause](sql-ref-syntax-qry-select-pivot.html) +* [LATERAL VIEW Clause](sql-ref-syntax-qry-select-lateral-view.html) diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index bac7c2bc6a06..4b4523c21a15 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -41,7 +41,7 @@ select_statement [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_stat While `select_statement` is defined as ```sql -SELECT [ hints , ... ] [ ALL | DISTINCT ] { named_expression [ , ... ] } +SELECT [ hints , ... ] [ ALL | DISTINCT ] { [ named_expression [ , ... ] | TRANSFORM Clause ] } FROM { from_item [ , ... ] } [ PIVOT clause ] [ LATERAL VIEW clause ] [ ... ] @@ -152,6 +152,11 @@ SELECT [ hints , ... ] [ ALL | DISTINCT ] { named_expression [ , ... ] } Specifies aliases for one or more source window specifications. The source window specifications can be referenced in the widow definitions in the query. +* **TRANSFORM** + + Specifies a hive-style transform (SELECT TRANSFORM/MAP/REDUCE) query specification to transform + the input by forking and running the specified script. + ### Related Statements * [WHERE Clause](sql-ref-syntax-qry-select-where.html) @@ -175,3 +180,4 @@ SELECT [ hints , ... ] [ ALL | DISTINCT ] { named_expression [ , ... 
] } * [CASE Clause](sql-ref-syntax-qry-select-case.html) * [PIVOT Clause](sql-ref-syntax-qry-select-pivot.html) * [LATERAL VIEW Clause](sql-ref-syntax-qry-select-lateral-view.html) +* [TRANSFORM Clause](sql-ref-syntax-qry-select-transform.html) diff --git a/docs/sql-ref-syntax-qry.md b/docs/sql-ref-syntax-qry.md index 6751b90e1244..9fb62dfd548e 100644 --- a/docs/sql-ref-syntax-qry.md +++ b/docs/sql-ref-syntax-qry.md @@ -49,4 +49,5 @@ ability to generate logical and physical plan for a given query using * [CASE Clause](sql-ref-syntax-qry-select-case.html) * [PIVOT Clause](sql-ref-syntax-qry-select-pivot.html) * [LATERAL VIEW Clause](sql-ref-syntax-qry-select-lateral-view.html) + * [TRANSFORM Clause](sql-ref-syntax-qry-select-transform.html) * [EXPLAIN Statement](sql-ref-syntax-qry-explain.html) diff --git a/docs/sql-ref-syntax.md b/docs/sql-ref-syntax.md index f3d35b57d90c..f0b993601876 100644 --- a/docs/sql-ref-syntax.md +++ b/docs/sql-ref-syntax.md @@ -70,6 +70,7 @@ Spark SQL is Apache Spark's module for working with structured data. 
The SQL Syn * [CASE Clause](sql-ref-syntax-qry-select-case.html) * [PIVOT Clause](sql-ref-syntax-qry-select-pivot.html) * [LATERAL VIEW Clause](sql-ref-syntax-qry-select-lateral-view.html) + * [TRANSFORM Clause](sql-ref-syntax-qry-select-transform.html) * [EXPLAIN](sql-ref-syntax-qry-explain.html) ### Auxiliary Statements From 7dbfebfbb9badf9ff0b327d86eaad14191f4406a Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 1 Apr 2021 16:10:33 +0800 Subject: [PATCH 02/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 100 ++++++++++++-------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 1928563b2f15..3406bd1984e0 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -47,9 +47,9 @@ These defaults can be overridden with `ROW FORMAT DELIMITED`. ```sql rowFormat - : ROW FORMAT SERDE serde_name [ WITH SERDEPROPERTIES serde_props ] + : ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] | ROW FORMAT DELIMITED - [ FIELDS TERMINATED BY fieldsTerminatedBy [ ESCAPED BY escapedBy ] ] + [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] [ MAP KEYS TERMINATED BY keysTerminatedBy ] [ LINES TERMINATED BY linesSeparatedBy ] @@ -62,11 +62,11 @@ namedExpressionSeq = named_expression [ , ... ] transformClause: SELECT [ TRANSFORM ( namedExpressionSeq ) | MAP namedExpressionSeq | REDUCE namedExpressionSeq ] [ inRowFormat ] - [ RECORDWRITER recordWriter ] + [ RECORDWRITER recordWriter_class ] USING script - [ AS ( [ identifierSeq | colTypeList ] [ , ... ] ) ] + [ AS ( [ col_name [ col_type ]] [ , ... ] ) ] [ outRowFormat ] - [ RECORDREADER recordReader ] + [ RECORDREADER recordReader_class ] [ WHERE boolean_expression ] [ GROUP BY expression [ , ... 
] ] [ HAVING boolean_expression ] @@ -76,59 +76,85 @@ transformClause: * **named_expression** - xxx. + An expression with an assigned name. In general, it denotes a column expression. -* **serde_name** + **Syntax:** `expression [AS] [alias]` - xxx. +* **row_format** -* **serde_props** + Use the `SERDE` clause to specify a custom SerDe for one table. Otherwise, use the `DELIMITED` clause to use the native SerDe and specify the delimiter, escape character, null character and so on. - xxx. +* **SERDE** -* **fieldsTerminatedBy** + Specifies a custom SerDe for one table. - xxx. +* **serde_class** -* **escapedBy** + Specifies a fully-qualified class name of a custom SerDe. - xxx. +* **SERDEPROPERTIES** -* **collectionItemsTerminatedBy** + A list of key-value pairs that is used to tag the SerDe definition. - xxx. +* **DELIMITED** -* **keysTerminatedBy** + The `DELIMITED` clause can be used to specify the native SerDe and state the delimiter, escape character, null character and so on. + +* **FIELDS TERMINATED BY** - xxx. + Used to define a column separator. + +* **COLLECTION ITEMS TERMINATED BY** -* **linesSeparatedBy** + Used to define a collection item separator. + +* **MAP KEYS TERMINATED BY** - xxx. + Used to define a map key separator. + +* **LINES TERMINATED BY** -* **nullDefinedAs** + Used to define a row separator. + +* **NULL DEFINED AS** - xxx. + Used to define the specific value for NULL. + +* **ESCAPED BY** -* **rowFormat** + Used for escape mechanism. - xxx. +* **RECORDREADER** -* **recordWriter** + Specifies a custom RecordReader for one table. - xxx. +* **RECORDWRITER** -* **recordReader** + Specifies a custom RecordWriter for one table. - xxx. - -* **identifierSeq** - - xxx. - -* **recordReader** - - xxx. +* **recordReader_class** + + Specifies a fully-qualified class name of a custom RecordReader. 
+ Default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader` + +* **recordWriter_class** + + Specifies a fully-qualified class name of a custom RecordWriter. + Default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. + +* **script** + + Specify a command to process data. + +* **boolean_expression** + + Specifies any expression that evaluates to a result type `boolean`. Two or + more expressions may be combined together using the logical + operators ( `AND`, `OR` ). + +* **expression** + + Specifies combination of one or more values, operators and SQL functions that results in a value. ### Without Hive support Mode @@ -136,7 +162,7 @@ transformClause: ### Schema-less Script Transforms -If there is no AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: +If there don't have AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: 1. key: which is before the first tab, 2. value: which is the rest after the first tab. From 488aed45a7aae74ba93c583a3284eea4b25eb1a6 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 1 Apr 2021 16:43:15 +0800 Subject: [PATCH 03/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 108 +++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 3406bd1984e0..c025c1b09e41 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -158,8 +158,15 @@ transformClause: ### Without Hive support Mode +Now Spark Script transform can run without `-Phive` or `SparkSession.builder.enableHiveSupport()`. +In this case, now we only use script transform with `ROW FORMAT DELIMIT` and treat all value passed +to script as string. 
+ +### With Hive Support Mode + +When build Spark with `-Phive` and start Spark SQL with `enableHiveSupport()`, we can use script +transform with Hive SerDe and both `ROW FORMAT DELIMIT`. + ### Schema-less Script Transforms If there don't have AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: @@ -173,7 +180,106 @@ between the first tab and the second tab if there are multiple tabs. ### Examples ```sql - +CREATE TABLE person (zip_code INT, name STRING, age INT); +INSERT INTO person VALUES + (94588, 'Zen Hui', 50), + (94588, 'Dan Li', 18), + (94588, 'Anil K', 27), + (94588, 'John V', NULL), + (94511, 'David K', 42), + (94511, 'Aryan B.', 18), + (94511, 'Lalit B.', NULL); + +-- With specified out put without data type +SELECT TRANSFORM(zip_code, name, age) + USING 'cat' AS (a, b, c) +FROM person +WHERE zip_code > 94511; ++-------+---------+-----+ +| a | b| c| ++-------+---------+-----+ +| 94588| Anil K| 27| +| 94588| John V| NULL| +| 94588| Zen Hui| 50| +| 94588| Dan Li| 18| ++-------+---------+-----+ + +-- With specified out put without data type +SELECT TRANSFORM(zip_code, name, age) + USING 'cat' AS (a STRING, b STRING, c STRING) +FROM person +WHERE zip_code > 94511; ++-------+---------+-----+ +| a | b| c| ++-------+---------+-----+ +| 94588| Anil K| 27| +| 94588| John V| NULL| +| 94588| Zen Hui| 50| +| 94588| Dan Li| 18| ++-------+---------+-----+ + +-- ROW FORMAT DELIMIT +SELECT TRANSFORM(name, age) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY ',' + LINES TERMINATED BY '\n' + NULL DEFINED AS 'NULL' + USING 'cat' AS (name_age string) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '@' + LINES TERMINATED BY '\n' + NULL DEFINED AS 'NULL' +FROM person; ++---------------+ +| name_age| ++---------------+ +| Anil K,27| +| John V,null| +| Aryan B.,18| +| David K,42| +| Zen Hui,50| +| Dan Li,18| +| Lalit B.,null| ++---------------+ + +-- Hive Serde +SELECT TRANSFORM(zip_code, name, age) + ROW FORMAT SERDE 
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ( + 'field.delim' = '\t' + ) + USING 'cat' AS (a STRING, b STRING, c STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ( + 'field.delim' = '\t' + ) +FROM person +WHERE zip_code > 94511; ++-------+---------+-----+ +| a | b| c| ++-------+---------+-----+ +| 94588| Anil K| 27| +| 94588| John V| NULL| +| 94588| Zen Hui| 50| +| 94588| Dan Li| 18| ++-------+---------+-----+ + +-- Schema-less mode +SELECT TRANSFORM(zip_code, name, age) + USING 'cat' +FROM person +WHERE zip_code > 94500; ++-------+-----------------+ +| key| value| ++-------+-----------------+ +| 94588| Anil K 27| +| 94588| John V \N| +| 94511| Aryan B. 18| +| 94511| David K 42| +| 94588| Zen Hui 50| +| 94588| Dan Li 18| +| 94511| Lalit B. \N| ++-------+-----------------+ ``` ### Related Statements From a7c61a0613195018b2f504cfdf1e901cee6ddd00 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Fri, 2 Apr 2021 22:41:55 +0800 Subject: [PATCH 04/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 38 ++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index c025c1b09e41..c2625afcf616 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -21,24 +21,24 @@ license: | ### Description -The `TRANSFORM` clause is used to specifies a hive-style transform (SELECT TRANSFORM/MAP/REDUCE) +The `TRANSFORM` clause is used to specifies a Hive-style transform (SELECT TRANSFORM/MAP/REDUCE) query specification to transform the input by forking and running the specified script. Users can plug in their own custom mappers and reducers in the data stream by using features natively supported -in the Spark/Hive language. e.g. 
in order to run a custom mapper script - map_script - and a custom -reducer script - reduce_script - the user can issue the following command which uses the TRANSFORM +in the Spark/Hive language. e.g. in order to run a custom mapper script `map_script` and a custom +reducer script `reduce_script` the user can issue the following command which uses the TRANSFORM clause to embed the mapper and the reducer scripts. Currently, Spark's script transform support two mode: - 1. Without Hive: It means we run Spark SQL without hive support, in this mode, we can use default format - by treating data as STRING and use Spark's own SerDe. - 2. WIth Hive: It means we run Spark SQL with Hive support, in this mode, when we use default format, - it will be treated as Hive default fomat. And we can use Hive supported SerDe to process data. + 1. Without Hive: It means spark run Spark SQL without Hive support, in this mode, spark can use default + `ROW FORMAT DELIMITED` by treating data as STRING. + 2. WIth Hive: It means spark run Spark SQL with Hive support, in this mode, when spark use default format, + it will be treated as Hive default fomat. And spark can use Hive supported SerDe to process data. -In both mode with default format, columns will be transformed to STRING and delimited by TAB before feeding -to the user script, Similarly, all NULL values will be converted to the literal string \N in order to -differentiate NULL values from empty strings. The standard output of the user script will be treated as -TAB-separated STRING columns, any cell containing only \N will be re-interpreted as a NULL, and then the +In both modes with default format, columns will be transformed to STRING and delimited by tabs before feeding +to the user script, Similarly, all `NULL` values will be converted to the literal string `\N` in order to +differentiate `NULL` values from empty strings. 
The standard output of the user script will be treated as +TAB-separated STRING columns, any cell containing only `\N` will be re-interpreted as a `NULL`, and then the resulting STRING column will be cast to the data type specified in the table declaration in the usual way. User scripts can output debug information to standard error which will be shown on the task detail page on hadoop. These defaults can be overridden with `ROW FORMAT DELIMITED`. @@ -158,23 +158,23 @@ transformClause: ### Without Hive support Mode -Now Spark Script transform can run without `-Phive` or `SparkSession.builder.enableHiveSupport()`. -In this case, now we only use script transform with `ROW FORMAT DELIMIT` and treat all value passed -to script as string. +Spark scripts transform can run without `-Phive` or `SparkSession.builder.enableHiveSupport()`. +In this case, now spark only use script transform with `ROW FORMAT DELIMITED` and treat all value passed +to script as a string. ### With Hive Support Mode -When build Spark with `-Phive` and start Spark SQL with `enableHiveSupport()`, we can use script -transform with Hive SerDe and both `ROW FORMAT DELIMIT`. +When built Spark with `-Phive` and started Spark SQL with `enableHiveSupport()`, spark can use script +transform with Hive SerDe and both `ROW FORMAT DELIMITED`. ### Schema-less Script Transforms -If there don't have AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: +If there is no AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: 1. key: which is before the first tab, 2. value: which is the rest after the first tab. -Note that this is different from specifying AS key, value because in that case, value will only contain the portion +Note that this is different from specifying `AS key, value` because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. 
### Examples @@ -218,7 +218,7 @@ WHERE zip_code > 94511; | 94588| Dan Li| 18| +-------+---------+-----+ --- ROW FORMAT DELIMIT +-- ROW FORMAT DELIMITED SELECT TRANSFORM(name, age) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' From 8db29cb2ae8940dda33365476a0383714324ab77 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Sat, 3 Apr 2021 00:44:00 +0800 Subject: [PATCH 05/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 112 +++++++------------- 1 file changed, 40 insertions(+), 72 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index c2625afcf616..77f68014c584 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -22,20 +22,13 @@ license: | ### Description The `TRANSFORM` clause is used to specifies a Hive-style transform (SELECT TRANSFORM/MAP/REDUCE) -query specification to transform the input by forking and running the specified script. Users can +query specification to transform the input by running a specified script. Users can plug in their own custom mappers and reducers in the data stream by using features natively supported -in the Spark/Hive language. e.g. in order to run a custom mapper script `map_script` and a custom +in the Spark SQL. e.g. in order to run a custom mapper script `map_script` and a custom reducer script `reduce_script` the user can issue the following command which uses the TRANSFORM clause to embed the mapper and the reducer scripts. -Currently, Spark's script transform support two mode: - - 1. Without Hive: It means spark run Spark SQL without Hive support, in this mode, spark can use default - `ROW FORMAT DELIMITED` by treating data as STRING. - 2. WIth Hive: It means spark run Spark SQL with Hive support, in this mode, when spark use default format, - it will be treated as Hive default fomat. And spark can use Hive supported SerDe to process data. 
- -In both modes with default format, columns will be transformed to STRING and delimited by tabs before feeding +In default format, columns will be transformed to STRING and delimited by tabs before feeding to the user script, Similarly, all `NULL` values will be converted to the literal string `\N` in order to differentiate `NULL` values from empty strings. The standard output of the user script will be treated as TAB-separated STRING columns, any cell containing only `\N` will be re-interpreted as a `NULL`, and then the @@ -46,30 +39,26 @@ These defaults can be overridden with `ROW FORMAT DELIMITED`. ### Syntax ```sql +SELECT [ TRANSFORM ( namedExpressionSeq ) | MAP namedExpressionSeq | REDUCE namedExpressionSeq ] + [ inRowFormat ] + [ RECORDWRITER recordWriter_class ] + USING script [ AS ( [ col_name [ col_type ]] [ , ... ] ) ] + [ outRowFormat ] + [ RECORDREADER recordReader_class ] + FROM { from_item [ , ... ] } + rowFormat : ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] | ROW FORMAT DELIMITED - [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] - [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] - [ MAP KEYS TERMINATED BY keysTerminatedBy ] - [ LINES TERMINATED BY linesSeparatedBy ] - [ NULL DEFINED AS nullDefinedAs ] + [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] + [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] + [ MAP KEYS TERMINATED BY keysTerminatedBy ] + [ LINES TERMINATED BY linesSeparatedBy ] + [ NULL DEFINED AS nullDefinedAs ] inRowFormat=rowFormat outRowFormat=rowFormat namedExpressionSeq = named_expression [ , ... ] - -transformClause: - SELECT [ TRANSFORM ( namedExpressionSeq ) | MAP namedExpressionSeq | REDUCE namedExpressionSeq ] - [ inRowFormat ] - [ RECORDWRITER recordWriter_class ] - USING script - [ AS ( [ col_name [ col_type ]] [ , ... 
] ) ] - [ outRowFormat ] - [ RECORDREADER recordReader_class ] - [ WHERE boolean_expression ] - [ GROUP BY expression [ , ... ] ] - [ HAVING boolean_expression ] ``` ### Parameters @@ -135,37 +124,16 @@ transformClause: * **recordReader_class** Specifies a fully-qualified class name of a custom RecordReader. - Default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader` + A default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader` * **recordWriter_class** Specifies a fully-qualified class name of a custom RecordWriter. - Default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. + A default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. * **script** - Specify a command to process data. - -* **boolean_expression** - - Specifies any expression that evaluates to a result type `boolean`. Two or - more expressions may be combined together using the logical - operators ( `AND`, `OR` ). - -* **expression** - - Specifies combination of one or more values, operators and SQL functions that results in a value. - -### Without Hive support Mode - -Spark scripts transform can run without `-Phive` or `SparkSession.builder.enableHiveSupport()`. -In this case, now spark only use script transform with `ROW FORMAT DELIMITED` and treat all value passed -to script as a string. - -### With Hive Support Mode - -When built Spark with `-Phive` and started Spark SQL with `enableHiveSupport()`, spark can use script -transform with Hive SerDe and both `ROW FORMAT DELIMITED`. + Specifies a command to process data. 
### Schema-less Script Transforms @@ -192,7 +160,7 @@ INSERT INTO person VALUES -- With specified out put without data type SELECT TRANSFORM(zip_code, name, age) - USING 'cat' AS (a, b, c) + USING 'cat' AS (a, b, c) FROM person WHERE zip_code > 94511; +-------+---------+-----+ @@ -206,7 +174,7 @@ WHERE zip_code > 94511; -- With specified out put without data type SELECT TRANSFORM(zip_code, name, age) - USING 'cat' AS (a STRING, b STRING, c STRING) + USING 'cat' AS (a STRING, b STRING, c STRING) FROM person WHERE zip_code > 94511; +-------+---------+-----+ @@ -220,15 +188,15 @@ WHERE zip_code > 94511; -- ROW FORMAT DELIMITED SELECT TRANSFORM(name, age) - ROW FORMAT DELIMITED - FIELDS TERMINATED BY ',' - LINES TERMINATED BY '\n' - NULL DEFINED AS 'NULL' - USING 'cat' AS (name_age string) - ROW FORMAT DELIMITED - FIELDS TERMINATED BY '@' - LINES TERMINATED BY '\n' - NULL DEFINED AS 'NULL' + ROW FORMAT DELIMITED + FIELDS TERMINATED BY ',' + LINES TERMINATED BY '\n' + NULL DEFINED AS 'NULL' + USING 'cat' AS (name_age string) + ROW FORMAT DELIMITED + FIELDS TERMINATED BY '@' + LINES TERMINATED BY '\n' + NULL DEFINED AS 'NULL' FROM person; +---------------+ | name_age| @@ -244,15 +212,15 @@ FROM person; -- Hive Serde SELECT TRANSFORM(zip_code, name, age) - ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' - WITH SERDEPROPERTIES ( - 'field.delim' = '\t' - ) - USING 'cat' AS (a STRING, b STRING, c STRING) - ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' - WITH SERDEPROPERTIES ( - 'field.delim' = '\t' - ) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ( + 'field.delim' = '\t' + ) + USING 'cat' AS (a STRING, b STRING, c STRING) + ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + WITH SERDEPROPERTIES ( + 'field.delim' = '\t' + ) FROM person WHERE zip_code > 94511; +-------+---------+-----+ @@ -266,7 +234,7 @@ WHERE zip_code > 94511; -- Schema-less mode SELECT 
TRANSFORM(zip_code, name, age) - USING 'cat' + USING 'cat' FROM person WHERE zip_code > 94500; +-------+-----------------+ From e59a6866061e8341e2cc0a7c6bc30de9d53e876b Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Sat, 3 Apr 2021 01:02:51 +0800 Subject: [PATCH 06/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 66 ++++++++++----------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 77f68014c584..1a8f6ab450c2 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -28,7 +28,7 @@ in the Spark SQL. e.g. in order to run a custom mapper script `map_script` and a reducer script `reduce_script` the user can issue the following command which uses the TRANSFORM clause to embed the mapper and the reducer scripts. -In default format, columns will be transformed to STRING and delimited by tabs before feeding +In default format, columns will be transformed to `STRING` and delimited by tabs before feeding to the user script, Similarly, all `NULL` values will be converted to the literal string `\N` in order to differentiate `NULL` values from empty strings. 
The standard output of the user script will be treated as TAB-separated STRING columns, any cell containing only `\N` will be re-interpreted as a `NULL`, and then the @@ -50,11 +50,11 @@ SELECT [ TRANSFORM ( namedExpressionSeq ) | MAP namedExpressionSeq | REDUCE name rowFormat : ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] | ROW FORMAT DELIMITED - [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] - [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] - [ MAP KEYS TERMINATED BY keysTerminatedBy ] - [ LINES TERMINATED BY linesSeparatedBy ] - [ NULL DEFINED AS nullDefinedAs ] + [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] + [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] + [ MAP KEYS TERMINATED BY keysTerminatedBy ] + [ LINES TERMINATED BY linesSeparatedBy ] + [ NULL DEFINED AS nullDefinedAs ] inRowFormat=rowFormat outRowFormat=rowFormat @@ -123,13 +123,11 @@ namedExpressionSeq = named_expression [ , ... ] * **recordReader_class** - Specifies a fully-qualified class name of a custom RecordReader. - A default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader` + Specifies a fully-qualified class name of a custom RecordReader. A default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader`. * **recordWriter_class** - Specifies a fully-qualified class name of a custom RecordWriter. - A default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. + Specifies a fully-qualified class name of a custom RecordWriter. A default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. 
* **script** @@ -158,7 +156,7 @@ INSERT INTO person VALUES (94511, 'Aryan B.', 18), (94511, 'Lalit B.', NULL); --- With specified out put without data type +-- With specified output without data type SELECT TRANSFORM(zip_code, name, age) USING 'cat' AS (a, b, c) FROM person @@ -166,13 +164,13 @@ WHERE zip_code > 94511; +-------+---------+-----+ | a | b| c| +-------+---------+-----+ -| 94588| Anil K| 27| -| 94588| John V| NULL| -| 94588| Zen Hui| 50| +| 94588| Anil K| 27| +| 94588| John V| NULL| +| 94588| Zen Hui| 50| | 94588| Dan Li| 18| +-------+---------+-----+ --- With specified out put without data type +-- With specified output with data type SELECT TRANSFORM(zip_code, name, age) USING 'cat' AS (a STRING, b STRING, c STRING) FROM person @@ -180,13 +178,13 @@ WHERE zip_code > 94511; +-------+---------+-----+ | a | b| c| +-------+---------+-----+ -| 94588| Anil K| 27| -| 94588| John V| NULL| -| 94588| Zen Hui| 50| +| 94588| Anil K| 27| +| 94588| John V| NULL| +| 94588| Zen Hui| 50| | 94588| Dan Li| 18| +-------+---------+-----+ --- ROW FORMAT DELIMITED +-- Using ROW FORMAT DELIMITED SELECT TRANSFORM(name, age) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' @@ -210,7 +208,7 @@ FROM person; | Lalit B.,null| +---------------+ --- Hive Serde +-- Using Hive Serde SELECT TRANSFORM(zip_code, name, age) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' WITH SERDEPROPERTIES ( @@ -226,9 +224,9 @@ WHERE zip_code > 94511; +-------+---------+-----+ | a | b| c| +-------+---------+-----+ -| 94588| Anil K| 27| -| 94588| John V| NULL| -| 94588| Zen Hui| 50| +| 94588| Anil K| 27| +| 94588| John V| NULL| +| 94588| Zen Hui| 50| | 94588| Dan Li| 18| +-------+---------+-----+ @@ -237,17 +235,17 @@ SELECT TRANSFORM(zip_code, name, age) USING 'cat' FROM person WHERE zip_code > 94500; -+-------+-----------------+ -| key| value| -+-------+-----------------+ -| 94588| Anil K 27| -| 94588| John V \N| -| 94511| Aryan B. 
18| -| 94511| David K 42| -| 94588| Zen Hui 50| -| 94588| Dan Li 18| -| 94511| Lalit B. \N| -+-------+-----------------+ ++-------+---------------------+ +| key| value| ++-------+---------------------+ +| 94588| Anil K 27| +| 94588| John V \N| +| 94511| Aryan B. 18| +| 94511| David K 42| +| 94588| Zen Hui 50| +| 94588| Dan Li 18| +| 94511| Lalit B. \N| ++-------+---------------------+ ``` ### Related Statements From e0ce6a52beea5d17626b18a06da06430e65afcda Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 8 Apr 2021 19:36:16 +0800 Subject: [PATCH 07/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 65 ++++++++++----------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 1a8f6ab450c2..d0da13f23f90 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -21,44 +21,33 @@ license: | ### Description -The `TRANSFORM` clause is used to specifies a Hive-style transform (SELECT TRANSFORM/MAP/REDUCE) +The `TRANSFORM` clause is used to specify a Hive-style transform (`SELECT TRANSFORM`/`MAP`/`REDUCE`) query specification to transform the input by running a specified script. Users can -plug in their own custom mappers and reducers in the data stream by using features natively supported -in the Spark SQL. e.g. in order to run a custom mapper script `map_script` and a custom -reducer script `reduce_script` the user can issue the following command which uses the TRANSFORM -clause to embed the mapper and the reducer scripts. - -In default format, columns will be transformed to `STRING` and delimited by tabs before feeding -to the user script, Similarly, all `NULL` values will be converted to the literal string `\N` in order to -differentiate `NULL` values from empty strings. 
The standard output of the user script will be treated as -TAB-separated STRING columns, any cell containing only `\N` will be re-interpreted as a `NULL`, and then the -resulting STRING column will be cast to the data type specified in the table declaration in the usual way. -User scripts can output debug information to standard error which will be shown on the task detail page on hadoop. -These defaults can be overridden with `ROW FORMAT DELIMITED`. +plug in their own custom mappers or reducers in the data stream by using features natively supported +in the Spark SQL. In order to run a custom mapper script `map_script` or a custom +reducer script `reduce_script` the user can issue the command which uses the `TRANSFORM` +clause to embed the mapper or the reducer scripts. ### Syntax ```sql -SELECT [ TRANSFORM ( namedExpressionSeq ) | MAP namedExpressionSeq | REDUCE namedExpressionSeq ] - [ inRowFormat ] +SELECT { TRANSFORM ( named_expression [ , ... ] ) | MAP named_expression [ , ... ] | REDUCE named_expression [ , ... ] } + [ rowFormat ] [ RECORDWRITER recordWriter_class ] USING script [ AS ( [ col_name [ col_type ]] [ , ... ] ) ] - [ outRowFormat ] + [ rowFormat ] [ RECORDREADER recordReader_class ] - FROM { from_item [ , ... ] } - -rowFormat - : ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] - | ROW FORMAT DELIMITED - [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] - [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] - [ MAP KEYS TERMINATED BY keysTerminatedBy ] - [ LINES TERMINATED BY linesSeparatedBy ] - [ NULL DEFINED AS nullDefinedAs ] - -inRowFormat=rowFormat -outRowFormat=rowFormat -namedExpressionSeq = named_expression [ , ... 
] +``` + +While `rowFormat` are defined as +```sql +{ ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] | +ROW FORMAT DELIMITED + [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] + [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] + [ MAP KEYS TERMINATED BY keysTerminatedBy ] + [ LINES TERMINATED BY linesSeparatedBy ] + [ NULL DEFINED AS nullDefinedAs ] } ``` ### Parameters @@ -133,14 +122,24 @@ namedExpressionSeq = named_expression [ , ... ] Specifies a command to process data. +### SERDE behavior + +In default format we use Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`, columns will be transformed to `STRING` and delimited by tabs before feeding +to the user script. Similarly, all `NULL` values will be converted to the literal string `"\N"` in order to +differentiate `NULL` values from empty strings. The standard output of the user script will be treated as +TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted as a `NULL`, and then the +resulting STRING column will be cast to the data type specified in the table declaration in the usual way. +User scripts can output debug information to standard error which will be shown on the task detail page on Spark. +These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. + ### Schema-less Script Transforms -If there is no AS clause after USING my_script, Spark assumes that the output of the script contains 2 parts: +If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 parts: - 1. key: which is before the first tab, + 1. key: which is before the first tab. 2. value: which is the rest after the first tab. 
-Note that this is different from specifying an AS `key, value` because in that case, the value will only contain the portion +Note that this is different from specifying an `AS key, value` because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. ### Examples From f7f895224a803693dc300aa757431edcc66d18f7 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 8 Apr 2021 19:37:46 +0800 Subject: [PATCH 08/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index d0da13f23f90..20c751dc38e2 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -124,13 +124,13 @@ ROW FORMAT DELIMITED ### SERDE behavior -In default format we use Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`, columns will be transformed to `STRING` and delimited by tabs before feeding -to the user script. Similarly, all `NULL` values will be converted to the literal string `"\N"` in order to -differentiate `NULL` values from empty strings. The standard output of the user script will be treated as -TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted as a `NULL`, and then the -resulting STRING column will be cast to the data type specified in the table declaration in the usual way. -User scripts can output debug information to standard error which will be shown on the task detail page on Spark. -These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. +In default format we use Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`, columns will be transformed +to `STRING` and delimited by tabs before feeding to the user script. 
Similarly, all `NULL` values will be converted +to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. The standard output of the +user script will be treated as TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted +as a `NULL`, and then the resulting STRING column will be cast to the data type specified in the table declaration +in the usual way. User scripts can output debug information to standard error which will be shown on the task detail +page on Spark. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. ### Schema-less Script Transforms From 5da3676edac4fc590a50da6eff9770f6e41d54a9 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 8 Apr 2021 22:38:52 +0800 Subject: [PATCH 09/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 20c751dc38e2..babdec3fa74f 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -34,7 +34,7 @@ clause to embed the mapper or the reducer scripts. SELECT { TRANSFORM ( named_expression [ , ... ] ) | MAP named_expression [ , ... ] | REDUCE named_expression [ , ... ] } [ rowFormat ] [ RECORDWRITER recordWriter_class ] - USING script [ AS ( [ col_name [ col_type ]] [ , ... ] ) ] + USING script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ] [ rowFormat ] [ RECORDREADER recordReader_class ] ``` @@ -42,12 +42,12 @@ SELECT { TRANSFORM ( named_expression [ , ... ] ) | MAP named_expression [ , ... 
While `rowFormat` are defined as ```sql { ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] | -ROW FORMAT DELIMITED - [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] - [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] - [ MAP KEYS TERMINATED BY keysTerminatedBy ] - [ LINES TERMINATED BY linesSeparatedBy ] - [ NULL DEFINED AS nullDefinedAs ] } + ROW FORMAT DELIMITED + [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] + [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] + [ MAP KEYS TERMINATED BY keysTerminatedBy ] + [ LINES TERMINATED BY linesSeparatedBy ] + [ NULL DEFINED AS nullDefinedAs ] } ``` ### Parameters @@ -60,7 +60,7 @@ ROW FORMAT DELIMITED * **row_format** - Use the `SERDE` clause to specify a custom SerDe for one table. Otherwise, use the `DELIMITED` clause to use the native SerDe and specify the delimiter, escape character, null character and so on. + Spark uses the `SERDE` clause to specify a custom SerDe for one table. Otherwise, use the `DELIMITED` clause to use the native SerDe and specify the delimiter, escape character, null character and so on. * **SERDE** @@ -122,9 +122,9 @@ ROW FORMAT DELIMITED Specifies a command to process data. -### SERDE behavior +### Serde behavior -In default format we use Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe`, columns will be transformed +Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be transformed to `STRING` and delimited by tabs before feeding to the user script. Similarly, all `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. 
The standard output of the user script will be treated as TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted From f6590afe5db02e21eec5630e0d32d6e7ed54c596 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 12 Apr 2021 22:10:19 +0800 Subject: [PATCH 10/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 24 +++++++-------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index babdec3fa74f..2c22a6c1aa3c 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -62,22 +62,14 @@ While `rowFormat` are defined as Spark uses the `SERDE` clause to specify a custom SerDe for one table. Otherwise, use the `DELIMITED` clause to use the native SerDe and specify the delimiter, escape character, null character and so on. -* **SERDE** - - Specifies a custom SerDe for one table. - * **serde_class** Specifies a fully-qualified class name of a custom SerDe. -* **SERDEPROPERTIES** +* **serde_props** A list of key-value pairs that is used to tag the SerDe definition. -* **DELIMITED** - - The `DELIMITED` clause can be used to specify the native SerDe and state the delimiter, escape character, null character and so on. - * **FIELDS TERMINATED BY** Used to define a column separator. @@ -125,22 +117,22 @@ While `rowFormat` are defined as ### Serde behavior Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be transformed -to `STRING` and delimited by tabs before feeding to the user script. Similarly, all `NULL` values will be converted +to `STRING` and combined by tabs before feeding to the user script. Similarly, all `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. 
The standard output of the user script will be treated as TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted as a `NULL`, and then the resulting STRING column will be cast to the data type specified in the table declaration -in the usual way. User scripts can output debug information to standard error which will be shown on the task detail -page on Spark. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. - -### Schema-less Script Transforms - -If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 parts: +in the usual way. If the actual number of output columns is less than the number of specified output columns, +insufficient output columns will be supplemented with `NULL`. If the actual number of output columns is more than the +number of specified output columns, the output columns will only select the corresponding columns and the remaining part +will be discarded. If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 parts: 1. key: which is before the first tab. 2. value: which is the rest after the first tab. Note that this is different from specifying an `AS key, value` because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. +User scripts can output debug information to standard error which will be shown on the task detail +page on Spark. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. 
### Examples From 0dc289c75029e35b5aa5f508770880bfe2c2c52c Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 12 Apr 2021 22:12:28 +0800 Subject: [PATCH 11/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 2c22a6c1aa3c..8c0e343723bf 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -117,7 +117,7 @@ While `rowFormat` are defined as ### Serde behavior Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be transformed -to `STRING` and combined by tabs before feeding to the user script. Similarly, all `NULL` values will be converted +to `STRING` and combined by tabs before feeding to the user script. All `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. The standard output of the user script will be treated as TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted as a `NULL`, and then the resulting STRING column will be cast to the data type specified in the table declaration From 0d60cb129a6d4050a1dc14de700627b8ca3f225e Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 13 Apr 2021 10:38:01 +0800 Subject: [PATCH 12/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 42 ++++++++------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 8c0e343723bf..482b203ba3b8 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -32,22 +32,22 @@ clause to embed the mapper or the reducer scripts. ```sql SELECT { TRANSFORM ( named_expression [ , ... 
] ) | MAP named_expression [ , ... ] | REDUCE named_expression [ , ... ] } - [ rowFormat ] - [ RECORDWRITER recordWriter_class ] + [ ROW FORMAT row_format ] + [ RECORDWRITER record_writer_class ] USING script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ] - [ rowFormat ] - [ RECORDREADER recordReader_class ] + [ ROW FORMAT row_format ] + [ RECORDREADER record_reader_class ] ``` -While `rowFormat` are defined as +While `row_format` are defined as ```sql -{ ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES serde_props ] | - ROW FORMAT DELIMITED - [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escapedBy ] ] - [ COLLECTION ITEMS TERMINATED BY collectionItemsTerminatedBy ] - [ MAP KEYS TERMINATED BY keysTerminatedBy ] - [ LINES TERMINATED BY linesSeparatedBy ] - [ NULL DEFINED AS nullDefinedAs ] } +{ SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ] | + DELIMITED + [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ] + [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ] + [ MAP KEYS TERMINATED BY map_key_terminated_char ] + [ LINES TERMINATED BY row_terminated_char ] + [ NULL DEFINED AS null_char ] } ``` ### Parameters @@ -66,10 +66,6 @@ While `rowFormat` are defined as Specifies a fully-qualified class name of a custom SerDe. -* **serde_props** - - A list of key-value pairs that is used to tag the SerDe definition. - * **FIELDS TERMINATED BY** Used to define a column separator. @@ -94,27 +90,19 @@ While `rowFormat` are defined as Used for escape mechanism. -* **RECORDREADER** - - Specifies a custom RecordReader for one table. - * **RECORDWRITER** - Specifies a custom RecordWriter for one table. + Specifies a fully-qualified class name of a custom RecordWriter. A default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. -* **recordReader_class** +* **RECORDREADER** Specifies a fully-qualified class name of a custom RecordReader. 
A default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader`. -* **recordWriter_class** - - Specifies a fully-qualified class name of a custom RecordWriter. A default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. - * **script** Specifies a command to process data. -### Serde behavior +### SerDe behavior Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be transformed to `STRING` and combined by tabs before feeding to the user script. All `NULL` values will be converted From b636dd1bdf705d00189a2670be32c52e92baabb9 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 13 Apr 2021 10:51:41 +0800 Subject: [PATCH 13/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 482b203ba3b8..5a8a2b40fcd6 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -62,10 +62,18 @@ While `row_format` are defined as Spark uses the `SERDE` clause to specify a custom SerDe for one table. Otherwise, use the `DELIMITED` clause to use the native SerDe and specify the delimiter, escape character, null character and so on. +* **SERDE** + + Specifies a custom SerDe for one table. + * **serde_class** Specifies a fully-qualified class name of a custom SerDe. +* **DELIMITED** + + The `DELIMITED` clause can be used to specify the native SerDe and state the delimiter, escape character, null character and so on. + * **FIELDS TERMINATED BY** Used to define a column separator. 
From a26f61eb173a607e1983aef6c31cb1abf72fd0cb Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 13 Apr 2021 10:55:02 +0800 Subject: [PATCH 14/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 5a8a2b40fcd6..8e87d8d62bc5 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -37,17 +37,14 @@ SELECT { TRANSFORM ( named_expression [ , ... ] ) | MAP named_expression [ , ... USING script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ] [ ROW FORMAT row_format ] [ RECORDREADER record_reader_class ] -``` -While `row_format` are defined as -```sql -{ SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ] | - DELIMITED - [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ] - [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ] +row_format: + : SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... 
) ] + | DELIMITED [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ] + [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ] [ MAP KEYS TERMINATED BY map_key_terminated_char ] [ LINES TERMINATED BY row_terminated_char ] - [ NULL DEFINED AS null_char ] } + [ NULL DEFINED AS null_char ] ``` ### Parameters From 108271015b84abb5bfda37b9546bfe3b138f43a9 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 11:19:48 +0800 Subject: [PATCH 15/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 8e87d8d62bc5..5eeb9d2df24f 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -31,7 +31,7 @@ clause to embed the mapper or the reducer scripts. ### Syntax ```sql -SELECT { TRANSFORM ( named_expression [ , ... ] ) | MAP named_expression [ , ... ] | REDUCE named_expression [ , ... ] } +{ SELECT TRANSFORM ( named_expression [ , ... ] ) | { MAP | REDUCE } named_expression [ , ... ] } [ ROW FORMAT row_format ] [ RECORDWRITER record_writer_class ] USING script [ AS ( [ col_name [ col_type ] ] [ , ... 
] ) ] From 89eee470d9f8cb562ad3a2ffda5edaa29f096f96 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 17:35:38 +0800 Subject: [PATCH 16/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 5eeb9d2df24f..2e09bebc30f7 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -21,17 +21,14 @@ license: | ### Description -The `TRANSFORM` clause is used to specify a Hive-style transform (`SELECT TRANSFORM`/`MAP`/`REDUCE`) -query specification to transform the input by running a specified script. Users can -plug in their own custom mappers or reducers in the data stream by using features natively supported -in the Spark SQL. In order to run a custom mapper script `map_script` or a custom -reducer script `reduce_script` the user can issue the command which uses the `TRANSFORM` -clause to embed the mapper or the reducer scripts. +The `TRANSFORM` clause is used to specify a Hive-style transform query specification +to transform the inputs by running a specified script. Users can plug in their own custom +scripts in the data stream by using features natively supported in the Spark SQL. ### Syntax ```sql -{ SELECT TRANSFORM ( named_expression [ , ... ] ) | { MAP | REDUCE } named_expression [ , ... ] } +SELECT TRANSFORM ( named_expression [ , ... ] ) [ ROW FORMAT row_format ] [ RECORDWRITER record_writer_class ] USING script [ AS ( [ col_name [ col_type ] ] [ , ... 
] ) ] From 4807201e30c833f3d7123daf51afaea1bb836ae4 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 18:46:05 +0800 Subject: [PATCH 17/25] follow comment --- docs/sql-ref-syntax-qry-select-transform.md | 10 +++++----- docs/sql-ref-syntax-qry-select.md | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 2e09bebc30f7..c7ffd20d897a 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -23,7 +23,7 @@ license: | The `TRANSFORM` clause is used to specify a Hive-style transform query specification to transform the inputs by running a specified script. Users can plug in their own custom -scripts in the data stream by using features natively supported in the Spark SQL. +scripts in the data stream by using `TRANSFORM` clause. ### Syntax @@ -94,19 +94,19 @@ row_format: * **RECORDWRITER** - Specifies a fully-qualified class name of a custom RecordWriter. A default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. + Specifies a fully-qualified class name of a custom RecordWriter. The default value is `org.apache.hadoop.hive.ql.exec.TextRecordWriter`. * **RECORDREADER** - Specifies a fully-qualified class name of a custom RecordReader. A default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader`. + Specifies a fully-qualified class name of a custom RecordReader. The default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader`. * **script** - Specifies a command to process data. + Specifies a command or a path to script to process data. ### SerDe behavior -Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be transformed +Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be cast
All `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. The standard output of the user script will be treated as TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index 27b88c1231d2..4ad4da9ae26a 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -41,7 +41,7 @@ select_statement [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_stat While `select_statement` is defined as ```sql -SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_names ] [ , ... ] | TRANSFORM Clause ] } +SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_names ] [ , ... ] | TRANSFORM ( named_expression [ , ... ] )) ] } FROM { from_item [ , ... ] } [ PIVOT clause ] [ LATERAL VIEW clause ] [ ... ] @@ -166,7 +166,7 @@ SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_n * **TRANSFORM** - Specifies a hive-style transform (SELECT TRANSFORM/MAP/REDUCE) query specification to transform + Specifies a hive-style transform (`SELECT TRANSFORM`) query specification to transform the input by forking and running the specified script. ### Related Statements From 8a11d90646fe264f070e6b3dc885a7dc8f0cb673 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 21:26:47 +0800 Subject: [PATCH 18/25] follow comment --- docs/sql-ref-syntax-qry-select-transform.md | 10 ++++------ docs/sql-ref-syntax-qry-select.md | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index c7ffd20d897a..eaf0839c0497 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -28,7 +28,7 @@ scripts in the data stream by using `TRANSFORM` clause. 
### Syntax ```sql -SELECT TRANSFORM ( named_expression [ , ... ] ) +SELECT TRANSFORM ( expression [ , ... ] ) [ ROW FORMAT row_format ] [ RECORDWRITER record_writer_class ] USING script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ] @@ -46,11 +46,9 @@ row_format: ### Parameters -* **named_expression** - - An expression with an assigned name. In general, it denotes a column expression. - - **Syntax:** `expression [AS] [alias]` +* **expression** + + Specifies a combination of one or more values, operators and SQL functions that results in a value. * **row_format** diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index 4ad4da9ae26a..2d15667c4ae2 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -41,7 +41,7 @@ select_statement [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_stat While `select_statement` is defined as ```sql -SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_names ] [ , ... ] | TRANSFORM ( named_expression [ , ... ] )) ] } +SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_names ] [ , ... ] | TRANSFORM ( expression [ , ... ] )) ] } FROM { from_item [ , ... ] } [ PIVOT clause ] [ LATERAL VIEW clause ] [ ... 
] From 9b7f66dcb3ec7641bc75a54975db529fb07013b5 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 21:54:45 +0800 Subject: [PATCH 19/25] update --- docs/sql-ref-syntax-qry-select-transform.md | 8 ++++---- docs/sql-ref-syntax-qry-select.md | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index eaf0839c0497..b060f7ed3445 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -22,8 +22,8 @@ license: | ### Description The `TRANSFORM` clause is used to specify a Hive-style transform query specification -to transform the inputs by running a specified script. Users can plug in their own custom -scripts in the data stream by using `TRANSFORM` clause. +to transform the inputs by running a specified command or script. Users can plug in their own custom +command or script in the data stream by using `TRANSFORM` clause. ### Syntax @@ -31,7 +31,7 @@ scripts in the data stream by using `TRANSFORM` clause. SELECT TRANSFORM ( expression [ , ... ] ) [ ROW FORMAT row_format ] [ RECORDWRITER record_writer_class ] - USING script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ] + USING command_or_script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ] [ ROW FORMAT row_format ] [ RECORDREADER record_reader_class ] @@ -98,7 +98,7 @@ row_format: Specifies a fully-qualified class name of a custom RecordReader. The default value is `org.apache.hadoop.hive.ql.exec.TextRecordReader`. -* **script** +* **command_or_script** Specifies a command or a path to script to process data. diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index 2d15667c4ae2..83669c045614 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -166,8 +166,7 @@ SELECT [ hints , ... 
] [ ALL | DISTINCT ] { [[ named_expression | regex_column_n * **TRANSFORM** - Specifies a hive-style transform (`SELECT TRANSFORM`) query specification to transform - the input by forking and running the specified script. + Specifies a Hive-style transform query specification to transform the input by forking and running a user-specified command or script. ### Related Statements From 8650461f2e4cfc08aaf2a468accc7faf59f5f7da Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 21:55:36 +0800 Subject: [PATCH 20/25] Update sql-ref-syntax-qry-select.md --- docs/sql-ref-syntax-qry-select.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-ref-syntax-qry-select.md b/docs/sql-ref-syntax-qry-select.md index 83669c045614..62a7f5f08520 100644 --- a/docs/sql-ref-syntax-qry-select.md +++ b/docs/sql-ref-syntax-qry-select.md @@ -41,7 +41,7 @@ select_statement [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_stat While `select_statement` is defined as ```sql -SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_names ] [ , ... ] | TRANSFORM ( expression [ , ... ] )) ] } +SELECT [ hints , ... ] [ ALL | DISTINCT ] { [[ named_expression | regex_column_names ] [ , ... ] | TRANSFORM (...) ] } FROM { from_item [ , ... ] } [ PIVOT clause ] [ LATERAL VIEW clause ] [ ... ] From e37d75d40e7b249ae81ea528dc088284e2278bd0 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 22:11:44 +0800 Subject: [PATCH 21/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index cf29e08dbc59..9d0cb4855e69 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -117,6 +117,7 @@ will be discarded. If there is no `AS` clause after `USING my_script`, Spark ass 1. 
key: which is before the first tab. 2. value: which is the rest after the first tab. +If there is no enough tab, Spark will return `NULL` value in Hive Serde mode or throw `ArrayOutOfBoundsException` in `ROW FORMAT DELIMIT` mode. Note that this is different from specifying an `AS key, value` because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. User scripts can output debug information to standard error which will be shown on the task detail From c040ef674bb2f5fd8e33c6cd4f90439a525a35b8 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 15 Apr 2021 23:30:41 +0800 Subject: [PATCH 22/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index cf29e08dbc59..9d0cb4855e69 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -22,8 +22,7 @@ license: | ### Description The `TRANSFORM` clause is used to specify a Hive-style transform query specification -to transform the inputs by running a specified command or script. Users can plug in their own custom -command or script in the data stream by using `TRANSFORM` clause. +to transform the inputs by running a user-specified command or script. ### Syntax @@ -117,7 +116,7 @@ will be discarded. If there is no `AS` clause after `USING my_script`, Spark ass 1. key: which is before the first tab. 2. value: which is the rest after the first tab. -If there is no enough tab, Spark will return `NULL` value in Hive Serde mode or throw `ArrayOutOfBoundsException` in `ROW FORMAT DELIMIT` mode. +If there is no enough tab, Spark will return `NULL` value in `SERDE` mode or throw `ArrayOutOfBoundsException` in `DELIMITED` mode. 
Note that this is different from specifying an `AS key, value` because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. User scripts can output debug information to standard error which will be shown on the task detail From 76038217f29414ece1df9121e26fd47099612f61 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 19 Apr 2021 14:26:40 +0800 Subject: [PATCH 23/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 9d0cb4855e69..0ea3d117382d 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -35,7 +35,7 @@ SELECT TRANSFORM ( expression [ , ... ] ) [ RECORDREADER record_reader_class ] row_format: - : SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ] + SERDE serde_class [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ] | DELIMITED [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ] [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ] [ MAP KEYS TERMINATED BY map_key_terminated_char ] @@ -103,24 +103,21 @@ row_format: ### SerDe behavior -Spark uses Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, columns will be casted +Spark uses the Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, so columns will be casted to `STRING` and combined by tabs before feeding to the user script. All `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. 
The standard output of the -user script will be treated as TAB-separated STRING columns, any cell containing only `"\N"` will be re-interpreted -as a `NULL`, and then the resulting STRING column will be cast to the data type specified in the table declaration -in the usual way. If the actual number of output columns is less than the number of specified output columns, +user script will be treated as tab-separated `STRING` columns, any cell containing only `"\N"` will be re-interpreted +as a `NULL`, and then the resulting STRING column will be cast to the data type specified in `col_type`. If the actual number of output columns is less than the number of specified output columns, insufficient output columns will be supplemented with `NULL`. If the actual number of output columns is more than the number of specified output columns, the output columns will only select the corresponding columns and the remaining part -will be discarded. If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 parts: +will be discarded. If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 attributes: 1. key: which is before the first tab. 2. value: which is the rest after the first tab. If there is no enough tab, Spark will return `NULL` value in `SERDE` mode or throw `ArrayOutOfBoundsException` in `DELIMITED` mode. Note that this is different from specifying an `AS key, value` because in that case, the value will only contain the portion -between the first tab and the second tab if there are multiple tabs. -User scripts can output debug information to standard error which will be shown on the task detail -page on Spark. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. +between the first tab and the second tab if there are multiple tabs. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. 
### Examples From 05057d3a6dd5e8bfdd1dd29d698cbc52bae652fb Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 19 Apr 2021 17:49:12 +0800 Subject: [PATCH 24/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index 0ea3d117382d..bc72da842fae 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -107,17 +107,18 @@ Spark uses the Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` b to `STRING` and combined by tabs before feeding to the user script. All `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. The standard output of the user script will be treated as tab-separated `STRING` columns, any cell containing only `"\N"` will be re-interpreted -as a `NULL`, and then the resulting STRING column will be cast to the data type specified in `col_type`. If the actual number of output columns is less than the number of specified output columns, -insufficient output columns will be supplemented with `NULL`. If the actual number of output columns is more than the -number of specified output columns, the output columns will only select the corresponding columns and the remaining part -will be discarded. If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 attributes: +as a `NULL`, and then the resulting STRING column will be cast to the data type specified in `col_type`. If the actual +number of output columns is less than the number of specified output columns, insufficient output columns will be +supplemented with `NULL`. 
If the actual number of output columns is more than the number of specified output columns, +the output columns will only select the corresponding columns and the remaining part will be discarded. +If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 attributes: 1. key: which is before the first tab. 2. value: which is the rest after the first tab. -If there is no enough tab, Spark will return `NULL` value in `SERDE` mode or throw `ArrayOutOfBoundsException` in `DELIMITED` mode. -Note that this is different from specifying an `AS key, value` because in that case, the value will only contain the portion -between the first tab and the second tab if there are multiple tabs. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. +If there is no enough tab, Spark will return `NULL` value. Note that this is different from specifying an `AS key, value` +because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. +These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. ### Examples From 25fa1530d34d86fe189e3c63d65e0366c9abbb67 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Mon, 19 Apr 2021 23:19:01 +0800 Subject: [PATCH 25/25] Update sql-ref-syntax-qry-select-transform.md --- docs/sql-ref-syntax-qry-select-transform.md | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/docs/sql-ref-syntax-qry-select-transform.md b/docs/sql-ref-syntax-qry-select-transform.md index bc72da842fae..814bd01ec2cf 100644 --- a/docs/sql-ref-syntax-qry-select-transform.md +++ b/docs/sql-ref-syntax-qry-select-transform.md @@ -51,7 +51,7 @@ row_format: * **row_format** - Spark uses the `SERDE` clause to specify a custom SerDe for one table. Otherwise, use the `DELIMITED` clause to use the native SerDe and specify the delimiter, escape character, null character and so on. 
+ Uses the `SERDE` clause to specify a custom SerDe for one table. Otherwise, uses the `DELIMITED` clause to specify the native SerDe and state the delimiter, escape character, null character and so on. * **SERDE** @@ -103,22 +103,17 @@ row_format: ### SerDe behavior -Spark uses the Hive Serde `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, so columns will be casted +Spark uses the Hive SerDe `org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe` by default, so columns will be cast to `STRING` and combined by tabs before feeding to the user script. All `NULL` values will be converted to the literal string `"\N"` in order to differentiate `NULL` values from empty strings. The standard output of the user script will be treated as tab-separated `STRING` columns, any cell containing only `"\N"` will be re-interpreted -as a `NULL`, and then the resulting STRING column will be cast to the data type specified in `col_type`. If the actual +as a `NULL` value, and then the resulting `STRING` column will be cast to the data type specified in `col_type`. If the actual number of output columns is less than the number of specified output columns, insufficient output columns will be supplemented with `NULL`. If the actual number of output columns is more than the number of specified output columns, the output columns will only select the corresponding columns and the remaining part will be discarded. -If there is no `AS` clause after `USING my_script`, Spark assumes that the output of the script contains 2 attributes: - - 1. key: which is before the first tab. - 2. value: which is the rest after the first tab. - -If there is no enough tab, Spark will return `NULL` value. Note that this is different from specifying an `AS key, value` -because in that case, the value will only contain the portion between the first tab and the second tab if there are multiple tabs. -These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. 
+If there is no `AS` clause after `USING my_script`, the output schema will be `key: STRING, value: STRING`. +The `key` column contains all the characters before the first tab and the `value` column contains the remaining characters after the first tab. +If there are not enough tabs, Spark will return a `NULL` value. These defaults can be overridden with `ROW FORMAT SERDE` or `ROW FORMAT DELIMITED`. ### Examples