Skip to content

Commit

Permalink
Enable parser to parse create external clauses in arbitrary order (#6257
Browse files Browse the repository at this point in the history
)

* Enable parser to parse create external clauses in random order

* Add more test

* Port error tests to sqllogic

* Adding a bunch of duplicate tests

* Add complete statement test

* Fix cargo fmt
  • Loading branch information
aprimadi authored May 8, 2023
1 parent 2e9beeb commit d94378e
Show file tree
Hide file tree
Showing 2 changed files with 217 additions and 112 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


###############
# Error tests #
###############

# Missing `STORED AS` and `LOCATION` clauses
statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\)
CREATE EXTERNAL TABLE t

# Missing `STORED AS` clause
statement error DataFusion error: SQL error: ParserError\("Missing STORED AS clause in CREATE EXTERNAL TABLE statement"\)
CREATE EXTERNAL TABLE t LOCATION 'foo.csv'

# Missing `LOCATION` clause
statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clause in CREATE EXTERNAL TABLE statement"\)
CREATE EXTERNAL TABLE t STORED AS CSV

# Option value is missing
statement error DataFusion error: SQL error: ParserError\("Expected literal string, found: \)"\)
CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah'

# Missing `(` in WITH ORDER clause
statement error DataFusion error: SQL error: ParserError\("Expected \(, found: c1"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv'

# Missing `)` in WITH ORDER clause
statement error DataFusion error: SQL error: ParserError\("Expected \), found: LOCATION"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv'

# Missing `ROW` in WITH HEADER clause
statement error DataFusion error: SQL error: ParserError\("Expected ROW, found: LOCATION"\)
CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc'

# Missing `BY` in PARTITIONED clause
statement error DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\)
CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc'

# Missing `TYPE` in COMPRESSION clause
statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\)
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc'

# Invalid compression type
statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\)
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah'

# Duplicate `STORED AS` clause
statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet'

# Duplicate `LOCATION` clause
statement error DataFusion error: SQL error: ParserError\("LOCATION specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'foo.csv' LOCATION 'bar.csv'

# Duplicate `WITH HEADER ROW` clause
statement error DataFusion error: SQL error: ParserError\("WITH HEADER ROW specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER ROW WITH HEADER ROW LOCATION 'foo.csv'

# Duplicate `WITH ORDER` clause
statement error DataFusion error: SQL error: ParserError\("WITH ORDER specified more than once"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1) WITH ORDER (c1) LOCATION 'foo.csv'

# Duplicate `DELIMITER` clause
statement error DataFusion error: SQL error: ParserError\("DELIMITER specified more than once"\)
CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '-' DELIMITER '+' LOCATION 'foo.csv'

# Duplicate `COMPRESSION TYPE` clause
statement error DataFusion error: SQL error: ParserError\("COMPRESSION TYPE specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE BZIP2 COMPRESSION TYPE XZ COMPRESSION TYPE ZSTD COMPRESSION TYPE GZIP LOCATION 'foo.csv'

# Duplicate `PARTITIONED BY` clause
statement error DataFusion error: SQL error: ParserError\("PARTITIONED BY specified more than once"\)
create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1) partitioned by (c2) LOCATION 'foo.csv'

# Duplicate `OPTIONS` clause
statement error DataFusion error: SQL error: ParserError\("OPTIONS specified more than once"\)
CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ('k1' 'v1', 'k2' 'v2') OPTIONS ('k3' 'v3') LOCATION 'foo.csv'
236 changes: 124 additions & 112 deletions datafusion/sql/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -403,59 +403,92 @@ impl<'a> DFParser<'a> {
.parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]);
let table_name = self.parser.parse_object_name()?;
let (columns, _) = self.parse_columns()?;
self.parser
.expect_keywords(&[Keyword::STORED, Keyword::AS])?;

// THIS is the main difference: we parse a different file format.
let file_type = self.parse_file_format()?;

let has_header = self.parse_csv_has_header();

let has_delimiter = self.parse_has_delimiter();
let delimiter = match has_delimiter {
true => self.parse_delimiter()?,
false => ',',
};

let file_compression_type = if self.parse_has_file_compression_type() {
self.parse_file_compression_type()?
} else {
CompressionTypeVariant::UNCOMPRESSED
};

let table_partition_cols = if self.parse_has_partition() {
self.parse_partitions()?
} else {
vec![]
};
#[derive(Default)]
struct Builder {
file_type: Option<String>,
location: Option<String>,
has_header: Option<bool>,
delimiter: Option<char>,
file_compression_type: Option<CompressionTypeVariant>,
table_partition_cols: Option<Vec<String>>,
order_exprs: Option<Vec<OrderByExpr>>,
options: Option<HashMap<String, String>>,
}
let mut builder = Builder::default();

let order_exprs = if self.parse_has_order() {
self.parse_order_by_exprs()?
} else {
vec![]
};
fn ensure_not_set<T>(field: &Option<T>, name: &str) -> Result<(), ParserError> {
if field.is_some() {
return Err(ParserError::ParserError(format!(
"{name} specified more than once",
)));
}
Ok(())
}

let options = if self.parse_has_options() {
self.parse_options()?
} else {
HashMap::new()
};
loop {
if self.parser.parse_keyword(Keyword::STORED) {
self.parser.expect_keyword(Keyword::AS)?;
ensure_not_set(&builder.file_type, "STORED AS")?;
builder.file_type = Some(self.parse_file_format()?);
} else if self.parser.parse_keyword(Keyword::LOCATION) {
ensure_not_set(&builder.location, "LOCATION")?;
builder.location = Some(self.parser.parse_literal_string()?);
} else if self
.parser
.parse_keywords(&[Keyword::WITH, Keyword::HEADER])
{
self.parser.expect_keyword(Keyword::ROW)?;
ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
builder.has_header = Some(true);
} else if self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER]) {
ensure_not_set(&builder.order_exprs, "WITH ORDER")?;
builder.order_exprs = Some(self.parse_order_by_exprs()?);
} else if self.parser.parse_keyword(Keyword::DELIMITER) {
ensure_not_set(&builder.delimiter, "DELIMITER")?;
builder.delimiter = Some(self.parse_delimiter()?);
} else if self.parser.parse_keyword(Keyword::COMPRESSION) {
self.parser.expect_keyword(Keyword::TYPE)?;
ensure_not_set(&builder.file_compression_type, "COMPRESSION TYPE")?;
builder.file_compression_type = Some(self.parse_file_compression_type()?);
} else if self.parser.parse_keyword(Keyword::PARTITIONED) {
self.parser.expect_keyword(Keyword::BY)?;
ensure_not_set(&builder.table_partition_cols, "PARTITIONED BY")?;
builder.table_partition_cols = Some(self.parse_partitions()?)
} else if self.parser.parse_keyword(Keyword::OPTIONS) {
ensure_not_set(&builder.options, "OPTIONS")?;
builder.options = Some(self.parse_options()?);
} else {
break;
}
}

self.parser.expect_keyword(Keyword::LOCATION)?;
let location = self.parser.parse_literal_string()?;
// Validations: location and file_type are required
if builder.file_type.is_none() {
return Err(ParserError::ParserError(
"Missing STORED AS clause in CREATE EXTERNAL TABLE statement".into(),
));
}
if builder.location.is_none() {
return Err(ParserError::ParserError(
"Missing LOCATION clause in CREATE EXTERNAL TABLE statement".into(),
));
}

let create = CreateExternalTable {
name: table_name.to_string(),
columns,
file_type,
has_header,
delimiter,
location,
table_partition_cols,
order_exprs,
file_type: builder.file_type.unwrap(),
has_header: builder.has_header.unwrap_or(false),
delimiter: builder.delimiter.unwrap_or(','),
location: builder.location.unwrap(),
table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]),
order_exprs: builder.order_exprs.unwrap_or(vec![]),
if_not_exists,
file_compression_type,
options,
file_compression_type: builder
.file_compression_type
.unwrap_or(CompressionTypeVariant::UNCOMPRESSED),
options: builder.options.unwrap_or(HashMap::new()),
};
Ok(Statement::CreateExternalTable(create))
}
Expand All @@ -480,11 +513,6 @@ impl<'a> DFParser<'a> {
}
}

fn parse_has_options(&mut self) -> bool {
self.parser.parse_keyword(Keyword::OPTIONS)
}

//
fn parse_options(&mut self) -> Result<HashMap<String, String>, ParserError> {
let mut options: HashMap<String, String> = HashMap::new();
self.parser.expect_token(&Token::LParen)?;
Expand All @@ -507,20 +535,6 @@ impl<'a> DFParser<'a> {
Ok(options)
}

fn parse_has_file_compression_type(&mut self) -> bool {
self.parser
.parse_keywords(&[Keyword::COMPRESSION, Keyword::TYPE])
}

fn parse_csv_has_header(&mut self) -> bool {
self.parser
.parse_keywords(&[Keyword::WITH, Keyword::HEADER, Keyword::ROW])
}

fn parse_has_delimiter(&mut self) -> bool {
self.parser.parse_keyword(Keyword::DELIMITER)
}

fn parse_delimiter(&mut self) -> Result<char, ParserError> {
let token = self.parser.parse_literal_string()?;
match token.len() {
Expand All @@ -530,15 +544,6 @@ impl<'a> DFParser<'a> {
)),
}
}

fn parse_has_partition(&mut self) -> bool {
self.parser
.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY])
}

fn parse_has_order(&mut self) -> bool {
self.parser.parse_keywords(&[Keyword::WITH, Keyword::ORDER])
}
}

#[cfg(test)]
Expand Down Expand Up @@ -924,48 +929,55 @@ mod tests {
});
expect_parse_ok(sql, expected)?;

// Error cases: partition column does not support type
let sql =
"CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah'";
expect_parse_error(sql, "sql parser error: Expected literal string, found: )");

// Error cases: partition column does not support type
let sql =
"CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv'";
expect_parse_error(sql, "sql parser error: Expected (, found: c1");

// Error cases: partition column does not support type
let sql =
"CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv'";
expect_parse_error(sql, "sql parser error: Expected ), found: LOCATION");

// Error case: `with header` is an invalid syntax
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc'";
expect_parse_error(sql, "sql parser error: Expected LOCATION, found: WITH");

// Error case: a single word `partitioned` is invalid
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc'";
expect_parse_error(
sql,
"sql parser error: Expected LOCATION, found: PARTITIONED",
);
// Most complete CREATE EXTERNAL TABLE statement possible
let sql = "
CREATE EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float)
STORED AS PARQUET
DELIMITER '*'
WITH HEADER ROW
WITH ORDER (c1 - c2 ASC)
COMPRESSION TYPE zstd
PARTITIONED BY (c1)
LOCATION 'foo.parquet'
OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO')
";
let expected = Statement::CreateExternalTable(CreateExternalTable {
name: "t".into(),
columns: vec![
make_column_def("c1", DataType::Int(None)),
make_column_def("c2", DataType::Float(None)),
],
file_type: "PARQUET".to_string(),
has_header: true,
delimiter: '*',
location: "foo.parquet".into(),
table_partition_cols: vec!["c1".into()],
order_exprs: vec![OrderByExpr {
expr: Expr::BinaryOp {
left: Box::new(Identifier(Ident {
value: "c1".to_owned(),
quote_style: None,
})),
op: BinaryOperator::Minus,
right: Box::new(Identifier(Ident {
value: "c2".to_owned(),
quote_style: None,
})),
},
asc: Some(true),
nulls_first: None,
}],
if_not_exists: true,
file_compression_type: CompressionTypeVariant::ZSTD,
options: HashMap::from([
("ROW_GROUP_SIZE".into(), "1024".into()),
("TRUNCATE".into(), "NO".into()),
]),
});
expect_parse_ok(sql, expected)?;

// Error case: a single word `compression` is invalid
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc'";
expect_parse_error(
sql,
"sql parser error: Expected LOCATION, found: COMPRESSION",
);
// For error cases, see: `create_external_table.slt`

Ok(())
}

#[test]
fn invalid_compression_type() {
let sql = "CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah'";
expect_parse_error(
sql,
"sql parser error: Unsupported file compression type ZZZ",
)
}
}

0 comments on commit d94378e

Please sign in to comment.