-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable parser to parse create external clauses in arbitrary order #6257
Changes from all commits
d061483
7ba59d0
9732d3f
719b5b2
d054815
a268bc7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
|
||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
|
||
|
||
############### | ||
# Error tests # | ||
############### | ||
|
||
# Missing `STORED AS` and `LOCATION` clauses | ||
statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\) | ||
CREATE EXTERNAL TABLE t | ||
|
||
# Missing `STORED AS` clause | ||
statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\) | ||
CREATE EXTERNAL TABLE t LOCATION 'foo.csv' | ||
|
||
# Missing `LOCATION` clause | ||
statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clause in CREATE EXTERNAL TABLE statement"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV | ||
|
||
# Option value is missing | ||
statement error DataFusion error: SQL error: ParserError\("Expected literal string, found: \)"\) | ||
CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah' | ||
|
||
# Missing `(` in WITH ORDER clause | ||
statement error DataFusion error: SQL error: ParserError\("Expected \(, found: c1"\) | ||
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv' | ||
|
||
# Missing `)` in WITH ORDER clause | ||
statement error DataFusion error: SQL error: ParserError\("Expected \), found: LOCATION"\) | ||
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv' | ||
|
||
# Missing `ROW` in WITH HEADER clause | ||
statement error DataFusion error: SQL error: ParserError\("Expected ROW, found: LOCATION"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' | ||
|
||
# Missing `BY` in PARTITIONED clause | ||
statement error DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' | ||
|
||
# Missing `TYPE` in COMPRESSION clause | ||
statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc' | ||
|
||
# Invalid compression type | ||
statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah' | ||
|
||
# Duplicate `STORED AS` clause | ||
statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' | ||
|
||
# Duplicate `LOCATION` clause | ||
statement error DataFusion error: SQL error: ParserError\("LOCATION specified more than once"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'foo.csv' LOCATION 'bar.csv' | ||
|
||
# Duplicate `WITH HEADER ROW` clause | ||
statement error DataFusion error: SQL error: ParserError\("WITH HEADER ROW specified more than once"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER ROW WITH HEADER ROW LOCATION 'foo.csv' | ||
|
||
# Duplicate `WITH ORDER` clause | ||
statement error DataFusion error: SQL error: ParserError\("WITH ORDER specified more than once"\) | ||
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1) WITH ORDER (c1) LOCATION 'foo.csv' | ||
|
||
# Duplicate `DELIMITER` clause | ||
statement error DataFusion error: SQL error: ParserError\("DELIMITER specified more than once"\) | ||
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '-' DELIMITER '+' LOCATION 'foo.csv' | ||
|
||
# Duplicate `COMPRESSION TYPE` clause | ||
statement error DataFusion error: SQL error: ParserError\("COMPRESSION TYPE specified more than once"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE BZIP2 COMPRESSION TYPE XZ COMPRESSION TYPE ZSTD COMPRESSION TYPE GZIP LOCATION 'foo.csv' | ||
|
||
# Duplicate `PARTITIONED BY` clause | ||
statement error DataFusion error: SQL error: ParserError\("PARTITIONED BY specified more than once"\) | ||
create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1) partitioned by (c2) LOCATION 'foo.csv' | ||
|
||
# Duplicate `OPTIONS` clause | ||
statement error DataFusion error: SQL error: ParserError\("OPTIONS specified more than once"\) | ||
CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ('k1' 'v1', 'k2' 'v2') OPTIONS ('k3' 'v3') LOCATION 'foo.csv' |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -379,59 +379,92 @@ impl<'a> DFParser<'a> { | |
.parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]); | ||
let table_name = self.parser.parse_object_name()?; | ||
let (columns, _) = self.parse_columns()?; | ||
self.parser | ||
.expect_keywords(&[Keyword::STORED, Keyword::AS])?; | ||
|
||
// THIS is the main difference: we parse a different file format. | ||
let file_type = self.parse_file_format()?; | ||
|
||
let has_header = self.parse_csv_has_header(); | ||
|
||
let has_delimiter = self.parse_has_delimiter(); | ||
let delimiter = match has_delimiter { | ||
true => self.parse_delimiter()?, | ||
false => ',', | ||
}; | ||
|
||
let file_compression_type = if self.parse_has_file_compression_type() { | ||
self.parse_file_compression_type()? | ||
} else { | ||
CompressionTypeVariant::UNCOMPRESSED | ||
}; | ||
|
||
let table_partition_cols = if self.parse_has_partition() { | ||
self.parse_partitions()? | ||
} else { | ||
vec![] | ||
}; | ||
#[derive(Default)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
struct Builder { | ||
file_type: Option<String>, | ||
location: Option<String>, | ||
has_header: Option<bool>, | ||
delimiter: Option<char>, | ||
file_compression_type: Option<CompressionTypeVariant>, | ||
table_partition_cols: Option<Vec<String>>, | ||
order_exprs: Option<Vec<OrderByExpr>>, | ||
options: Option<HashMap<String, String>>, | ||
} | ||
let mut builder = Builder::default(); | ||
|
||
let order_exprs = if self.parse_has_order() { | ||
self.parse_order_by_exprs()? | ||
} else { | ||
vec![] | ||
}; | ||
fn ensure_not_set<T>(field: &Option<T>, name: &str) -> Result<(), ParserError> { | ||
if field.is_some() { | ||
return Err(ParserError::ParserError(format!( | ||
"{name} specified more than once", | ||
))); | ||
} | ||
Ok(()) | ||
} | ||
|
||
let options = if self.parse_has_options() { | ||
self.parse_options()? | ||
} else { | ||
HashMap::new() | ||
}; | ||
loop { | ||
if self.parser.parse_keyword(Keyword::STORED) { | ||
self.parser.expect_keyword(Keyword::AS)?; | ||
ensure_not_set(&builder.file_type, "STORED AS")?; | ||
builder.file_type = Some(self.parse_file_format()?); | ||
} else if self.parser.parse_keyword(Keyword::LOCATION) { | ||
ensure_not_set(&builder.location, "LOCATION")?; | ||
builder.location = Some(self.parser.parse_literal_string()?); | ||
} else if self | ||
.parser | ||
.parse_keywords(&[Keyword::WITH, Keyword::HEADER]) | ||
{ | ||
self.parser.expect_keyword(Keyword::ROW)?; | ||
ensure_not_set(&builder.has_header, "WITH HEADER ROW")?; | ||
builder.has_header = Some(true); | ||
} else if self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER]) { | ||
ensure_not_set(&builder.order_exprs, "WITH ORDER")?; | ||
builder.order_exprs = Some(self.parse_order_by_exprs()?); | ||
} else if self.parser.parse_keyword(Keyword::DELIMITER) { | ||
ensure_not_set(&builder.delimiter, "DELIMITER")?; | ||
builder.delimiter = Some(self.parse_delimiter()?); | ||
} else if self.parser.parse_keyword(Keyword::COMPRESSION) { | ||
self.parser.expect_keyword(Keyword::TYPE)?; | ||
ensure_not_set(&builder.file_compression_type, "COMPRESSION TYPE")?; | ||
builder.file_compression_type = Some(self.parse_file_compression_type()?); | ||
} else if self.parser.parse_keyword(Keyword::PARTITIONED) { | ||
self.parser.expect_keyword(Keyword::BY)?; | ||
ensure_not_set(&builder.table_partition_cols, "PARTITIONED BY")?; | ||
builder.table_partition_cols = Some(self.parse_partitions()?) | ||
} else if self.parser.parse_keyword(Keyword::OPTIONS) { | ||
ensure_not_set(&builder.options, "OPTIONS")?; | ||
builder.options = Some(self.parse_options()?); | ||
} else { | ||
break; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure will it be worth adding more checks like e.g. a typo like CREATE EXTERNAL TABLE t(c1 int)
STORED AS CSV
WITH HEAD ROW
LOCATION 'foo.csv'; will turn into error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree this would make a better user experience and I think it would be a good follow on project. I will file the ticket There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
} | ||
} | ||
|
||
self.parser.expect_keyword(Keyword::LOCATION)?; | ||
let location = self.parser.parse_literal_string()?; | ||
// Validations: location and file_type are required | ||
if builder.file_type.is_none() { | ||
return Err(ParserError::ParserError( | ||
"Missing STORED AS clause in CREATE EXTERNAL TABLE statement".into(), | ||
)); | ||
} | ||
if builder.location.is_none() { | ||
return Err(ParserError::ParserError( | ||
"Missing LOCATION clause in CREATE EXTERNAL TABLE statement".into(), | ||
)); | ||
} | ||
|
||
let create = CreateExternalTable { | ||
name: table_name.to_string(), | ||
columns, | ||
file_type, | ||
has_header, | ||
delimiter, | ||
location, | ||
table_partition_cols, | ||
order_exprs, | ||
file_type: builder.file_type.unwrap(), | ||
has_header: builder.has_header.unwrap_or(false), | ||
delimiter: builder.delimiter.unwrap_or(','), | ||
location: builder.location.unwrap(), | ||
table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]), | ||
order_exprs: builder.order_exprs.unwrap_or(vec![]), | ||
if_not_exists, | ||
file_compression_type, | ||
options, | ||
file_compression_type: builder | ||
.file_compression_type | ||
.unwrap_or(CompressionTypeVariant::UNCOMPRESSED), | ||
options: builder.options.unwrap_or(HashMap::new()), | ||
}; | ||
Ok(Statement::CreateExternalTable(create)) | ||
} | ||
|
@@ -456,11 +489,6 @@ impl<'a> DFParser<'a> { | |
} | ||
} | ||
|
||
fn parse_has_options(&mut self) -> bool { | ||
self.parser.parse_keyword(Keyword::OPTIONS) | ||
} | ||
|
||
// | ||
fn parse_options(&mut self) -> Result<HashMap<String, String>, ParserError> { | ||
let mut options: HashMap<String, String> = HashMap::new(); | ||
self.parser.expect_token(&Token::LParen)?; | ||
|
@@ -483,20 +511,6 @@ impl<'a> DFParser<'a> { | |
Ok(options) | ||
} | ||
|
||
fn parse_has_file_compression_type(&mut self) -> bool { | ||
self.parser | ||
.parse_keywords(&[Keyword::COMPRESSION, Keyword::TYPE]) | ||
} | ||
|
||
fn parse_csv_has_header(&mut self) -> bool { | ||
self.parser | ||
.parse_keywords(&[Keyword::WITH, Keyword::HEADER, Keyword::ROW]) | ||
} | ||
|
||
fn parse_has_delimiter(&mut self) -> bool { | ||
self.parser.parse_keyword(Keyword::DELIMITER) | ||
} | ||
|
||
fn parse_delimiter(&mut self) -> Result<char, ParserError> { | ||
let token = self.parser.parse_literal_string()?; | ||
match token.len() { | ||
|
@@ -506,15 +520,6 @@ impl<'a> DFParser<'a> { | |
)), | ||
} | ||
} | ||
|
||
fn parse_has_partition(&mut self) -> bool { | ||
self.parser | ||
.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) | ||
} | ||
|
||
fn parse_has_order(&mut self) -> bool { | ||
self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER]) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
|
@@ -900,48 +905,55 @@ mod tests { | |
}); | ||
expect_parse_ok(sql, expected)?; | ||
|
||
// Error cases: partition column does not support type | ||
let sql = | ||
"CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah'"; | ||
expect_parse_error(sql, "sql parser error: Expected literal string, found: )"); | ||
|
||
// Error cases: partition column does not support type | ||
let sql = | ||
"CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv'"; | ||
expect_parse_error(sql, "sql parser error: Expected (, found: c1"); | ||
|
||
// Error cases: partition column does not support type | ||
let sql = | ||
"CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv'"; | ||
expect_parse_error(sql, "sql parser error: Expected ), found: LOCATION"); | ||
|
||
// Error case: `with header` is an invalid syntax | ||
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc'"; | ||
expect_parse_error(sql, "sql parser error: Expected LOCATION, found: WITH"); | ||
|
||
// Error case: a single word `partitioned` is invalid | ||
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc'"; | ||
expect_parse_error( | ||
sql, | ||
"sql parser error: Expected LOCATION, found: PARTITIONED", | ||
); | ||
// Most complete CREATE EXTERNAL TABLE statement possible | ||
let sql = " | ||
CREATE EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float) | ||
STORED AS PARQUET | ||
DELIMITER '*' | ||
WITH HEADER ROW | ||
WITH ORDER (c1 - c2 ASC) | ||
COMPRESSION TYPE zstd | ||
PARTITIONED BY (c1) | ||
LOCATION 'foo.parquet' | ||
OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO') | ||
"; | ||
let expected = Statement::CreateExternalTable(CreateExternalTable { | ||
name: "t".into(), | ||
columns: vec![ | ||
make_column_def("c1", DataType::Int(None)), | ||
make_column_def("c2", DataType::Float(None)), | ||
], | ||
file_type: "PARQUET".to_string(), | ||
has_header: true, | ||
delimiter: '*', | ||
location: "foo.parquet".into(), | ||
table_partition_cols: vec!["c1".into()], | ||
order_exprs: vec![OrderByExpr { | ||
expr: Expr::BinaryOp { | ||
left: Box::new(Identifier(Ident { | ||
value: "c1".to_owned(), | ||
quote_style: None, | ||
})), | ||
op: BinaryOperator::Minus, | ||
right: Box::new(Identifier(Ident { | ||
value: "c2".to_owned(), | ||
quote_style: None, | ||
})), | ||
}, | ||
asc: Some(true), | ||
nulls_first: None, | ||
}], | ||
if_not_exists: true, | ||
file_compression_type: CompressionTypeVariant::ZSTD, | ||
options: HashMap::from([ | ||
("ROW_GROUP_SIZE".into(), "1024".into()), | ||
("TRUNCATE".into(), "NO".into()), | ||
]), | ||
}); | ||
expect_parse_ok(sql, expected)?; | ||
|
||
// Error case: a single word `compression` is invalid | ||
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc'"; | ||
expect_parse_error( | ||
sql, | ||
"sql parser error: Expected LOCATION, found: COMPRESSION", | ||
); | ||
// For error cases, see: `create_external_table.slt` | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn invalid_compression_type() { | ||
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah'"; | ||
expect_parse_error( | ||
sql, | ||
"sql parser error: Unsupported file compression type ZZZ", | ||
) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍 these tests are so nice -- thank you @aprimadi