Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable parser to parse create external clauses in arbitrary order #6257

Merged
merged 6 commits into from
May 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


###############
# Error tests #
###############

# Missing `STORED AS` and `LOCATION` clauses
statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 these tests are so nice -- thank you @aprimadi

CREATE EXTERNAL TABLE t

# Missing `STORED AS` clause
statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\)
CREATE EXTERNAL TABLE t LOCATION 'foo.csv'

# Missing `LOCATION` clause
statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clause in CREATE EXTERNAL TABLE statement"\)
CREATE EXTERNAL TABLE t STORED AS CSV

# Option value is missing
statement error DataFusion error: SQL error: ParserError\("Expected literal string, found: \)"\)
CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah'

# Missing `(` in WITH ORDER clause
statement error DataFusion error: SQL error: ParserError\("Expected \(, found: c1"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv'

# Missing `)` in WITH ORDER clause
statement error DataFusion error: SQL error: ParserError\("Expected \), found: LOCATION"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv'

# Missing `ROW` in WITH HEADER clause
statement error DataFusion error: SQL error: ParserError\("Expected ROW, found: LOCATION"\)
CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc'

# Missing `BY` in PARTITIONED clause
statement error DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\)
CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc'

# Missing `TYPE` in COMPRESSION clause
statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\)
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc'

# Invalid compression type
statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\)
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah'

# Duplicate `STORED AS` clause
statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet'

# Duplicate `LOCATION` clause
statement error DataFusion error: SQL error: ParserError\("LOCATION specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'foo.csv' LOCATION 'bar.csv'

# Duplicate `WITH HEADER ROW` clause
statement error DataFusion error: SQL error: ParserError\("WITH HEADER ROW specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER ROW WITH HEADER ROW LOCATION 'foo.csv'

# Duplicate `WITH ORDER` clause
statement error DataFusion error: SQL error: ParserError\("WITH ORDER specified more than once"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1) WITH ORDER (c1) LOCATION 'foo.csv'

# Duplicate `DELIMITER` clause
statement error DataFusion error: SQL error: ParserError\("DELIMITER specified more than once"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '-' DELIMITER '+' LOCATION 'foo.csv'

# Duplicate `COMPRESSION TYPE` clause
statement error DataFusion error: SQL error: ParserError\("COMPRESSION TYPE specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE BZIP2 COMPRESSION TYPE XZ COMPRESSION TYPE ZSTD COMPRESSION TYPE GZIP LOCATION 'foo.csv'

# Duplicate `PARTITIONED BY` clause
statement error DataFusion error: SQL error: ParserError\("PARTITIONED BY specified more than once"\)
create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1) partitioned by (c2) LOCATION 'foo.csv'

# Duplicate `OPTIONS` clause
statement error DataFusion error: SQL error: ParserError\("OPTIONS specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ('k1' 'v1', 'k2' 'v2') OPTIONS ('k3' 'v3') LOCATION 'foo.csv'
236 changes: 124 additions & 112 deletions datafusion/sql/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,59 +379,92 @@ impl<'a> DFParser<'a> {
.parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]);
let table_name = self.parser.parse_object_name()?;
let (columns, _) = self.parse_columns()?;
self.parser
.expect_keywords(&[Keyword::STORED, Keyword::AS])?;

// THIS is the main difference: we parse a different file format.
let file_type = self.parse_file_format()?;

let has_header = self.parse_csv_has_header();

let has_delimiter = self.parse_has_delimiter();
let delimiter = match has_delimiter {
true => self.parse_delimiter()?,
false => ',',
};

let file_compression_type = if self.parse_has_file_compression_type() {
self.parse_file_compression_type()?
} else {
CompressionTypeVariant::UNCOMPRESSED
};

let table_partition_cols = if self.parse_has_partition() {
self.parse_partitions()?
} else {
vec![]
};
#[derive(Default)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

struct Builder {
file_type: Option<String>,
location: Option<String>,
has_header: Option<bool>,
delimiter: Option<char>,
file_compression_type: Option<CompressionTypeVariant>,
table_partition_cols: Option<Vec<String>>,
order_exprs: Option<Vec<OrderByExpr>>,
options: Option<HashMap<String, String>>,
}
let mut builder = Builder::default();

let order_exprs = if self.parse_has_order() {
self.parse_order_by_exprs()?
} else {
vec![]
};
fn ensure_not_set<T>(field: &Option<T>, name: &str) -> Result<(), ParserError> {
if field.is_some() {
return Err(ParserError::ParserError(format!(
"{name} specified more than once",
)));
}
Ok(())
}

let options = if self.parse_has_options() {
self.parse_options()?
} else {
HashMap::new()
};
loop {
if self.parser.parse_keyword(Keyword::STORED) {
self.parser.expect_keyword(Keyword::AS)?;
ensure_not_set(&builder.file_type, "STORED AS")?;
builder.file_type = Some(self.parse_file_format()?);
} else if self.parser.parse_keyword(Keyword::LOCATION) {
ensure_not_set(&builder.location, "LOCATION")?;
builder.location = Some(self.parser.parse_literal_string()?);
} else if self
.parser
.parse_keywords(&[Keyword::WITH, Keyword::HEADER])
{
self.parser.expect_keyword(Keyword::ROW)?;
ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
builder.has_header = Some(true);
} else if self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER]) {
ensure_not_set(&builder.order_exprs, "WITH ORDER")?;
builder.order_exprs = Some(self.parse_order_by_exprs()?);
} else if self.parser.parse_keyword(Keyword::DELIMITER) {
ensure_not_set(&builder.delimiter, "DELIMITER")?;
builder.delimiter = Some(self.parse_delimiter()?);
} else if self.parser.parse_keyword(Keyword::COMPRESSION) {
self.parser.expect_keyword(Keyword::TYPE)?;
ensure_not_set(&builder.file_compression_type, "COMPRESSION TYPE")?;
builder.file_compression_type = Some(self.parse_file_compression_type()?);
} else if self.parser.parse_keyword(Keyword::PARTITIONED) {
self.parser.expect_keyword(Keyword::BY)?;
ensure_not_set(&builder.table_partition_cols, "PARTITIONED BY")?;
builder.table_partition_cols = Some(self.parse_partitions()?)
} else if self.parser.parse_keyword(Keyword::OPTIONS) {
ensure_not_set(&builder.options, "OPTIONS")?;
builder.options = Some(self.parse_options()?);
} else {
break;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure will it be worth adding more checks like expect_one_of_keywords (or other smarter ways) to make the user experience better? (though I couldn't find it's universally applied to all parser paths)

e.g. a typo like

CREATE EXTERNAL TABLE t(c1 int)
STORED AS CSV
WITH HEAD ROW
LOCATION 'foo.csv';

will turn into error sql parser error: Missing LOCATION clause in CREATE EXTERNAL TABLE statement which is a bit confusing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this would make a better user experience and I think it would be a good follow on project. I will file the ticket

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}
}

self.parser.expect_keyword(Keyword::LOCATION)?;
let location = self.parser.parse_literal_string()?;
// Validations: location and file_type are required
if builder.file_type.is_none() {
return Err(ParserError::ParserError(
"Missing STORED AS clause in CREATE EXTERNAL TABLE statement".into(),
));
}
if builder.location.is_none() {
return Err(ParserError::ParserError(
"Missing LOCATION clause in CREATE EXTERNAL TABLE statement".into(),
));
}

let create = CreateExternalTable {
name: table_name.to_string(),
columns,
file_type,
has_header,
delimiter,
location,
table_partition_cols,
order_exprs,
file_type: builder.file_type.unwrap(),
has_header: builder.has_header.unwrap_or(false),
delimiter: builder.delimiter.unwrap_or(','),
location: builder.location.unwrap(),
table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]),
order_exprs: builder.order_exprs.unwrap_or(vec![]),
if_not_exists,
file_compression_type,
options,
file_compression_type: builder
.file_compression_type
.unwrap_or(CompressionTypeVariant::UNCOMPRESSED),
options: builder.options.unwrap_or(HashMap::new()),
};
Ok(Statement::CreateExternalTable(create))
}
Expand All @@ -456,11 +489,6 @@ impl<'a> DFParser<'a> {
}
}

fn parse_has_options(&mut self) -> bool {
self.parser.parse_keyword(Keyword::OPTIONS)
}

//
fn parse_options(&mut self) -> Result<HashMap<String, String>, ParserError> {
let mut options: HashMap<String, String> = HashMap::new();
self.parser.expect_token(&Token::LParen)?;
Expand All @@ -483,20 +511,6 @@ impl<'a> DFParser<'a> {
Ok(options)
}

fn parse_has_file_compression_type(&mut self) -> bool {
self.parser
.parse_keywords(&[Keyword::COMPRESSION, Keyword::TYPE])
}

fn parse_csv_has_header(&mut self) -> bool {
self.parser
.parse_keywords(&[Keyword::WITH, Keyword::HEADER, Keyword::ROW])
}

fn parse_has_delimiter(&mut self) -> bool {
self.parser.parse_keyword(Keyword::DELIMITER)
}

fn parse_delimiter(&mut self) -> Result<char, ParserError> {
let token = self.parser.parse_literal_string()?;
match token.len() {
Expand All @@ -506,15 +520,6 @@ impl<'a> DFParser<'a> {
)),
}
}

fn parse_has_partition(&mut self) -> bool {
self.parser
.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY])
}

fn parse_has_order(&mut self) -> bool {
self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER])
}
}

#[cfg(test)]
Expand Down Expand Up @@ -900,48 +905,55 @@ mod tests {
});
expect_parse_ok(sql, expected)?;

// Error cases: partition column does not support type
let sql =
"CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah'";
expect_parse_error(sql, "sql parser error: Expected literal string, found: )");

// Error cases: partition column does not support type
let sql =
"CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv'";
expect_parse_error(sql, "sql parser error: Expected (, found: c1");

// Error cases: partition column does not support type
let sql =
"CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv'";
expect_parse_error(sql, "sql parser error: Expected ), found: LOCATION");

// Error case: `with header` is an invalid syntax
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc'";
expect_parse_error(sql, "sql parser error: Expected LOCATION, found: WITH");

// Error case: a single word `partitioned` is invalid
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc'";
expect_parse_error(
sql,
"sql parser error: Expected LOCATION, found: PARTITIONED",
);
// Most complete CREATE EXTERNAL TABLE statement possible
let sql = "
CREATE EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float)
STORED AS PARQUET
DELIMITER '*'
WITH HEADER ROW
WITH ORDER (c1 - c2 ASC)
COMPRESSION TYPE zstd
PARTITIONED BY (c1)
LOCATION 'foo.parquet'
OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO')
";
let expected = Statement::CreateExternalTable(CreateExternalTable {
name: "t".into(),
columns: vec![
make_column_def("c1", DataType::Int(None)),
make_column_def("c2", DataType::Float(None)),
],
file_type: "PARQUET".to_string(),
has_header: true,
delimiter: '*',
location: "foo.parquet".into(),
table_partition_cols: vec!["c1".into()],
order_exprs: vec![OrderByExpr {
expr: Expr::BinaryOp {
left: Box::new(Identifier(Ident {
value: "c1".to_owned(),
quote_style: None,
})),
op: BinaryOperator::Minus,
right: Box::new(Identifier(Ident {
value: "c2".to_owned(),
quote_style: None,
})),
},
asc: Some(true),
nulls_first: None,
}],
if_not_exists: true,
file_compression_type: CompressionTypeVariant::ZSTD,
options: HashMap::from([
("ROW_GROUP_SIZE".into(), "1024".into()),
("TRUNCATE".into(), "NO".into()),
]),
});
expect_parse_ok(sql, expected)?;

// Error case: a single word `compression` is invalid
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc'";
expect_parse_error(
sql,
"sql parser error: Expected LOCATION, found: COMPRESSION",
);
// For error cases, see: `create_external_table.slt`

Ok(())
}

#[test]
fn invalid_compression_type() {
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah'";
expect_parse_error(
sql,
"sql parser error: Unsupported file compression type ZZZ",
)
}
}