Skip to content

Commit

Permalink
Reject CREATE TABLE/VIEW with duplicate column names
Browse files Browse the repository at this point in the history
DFSchema checks column for uniqueness, but allows duplicate column names
when they are qualified differently. This is because DFSchema plays
central role during query planning as a identifier resolution scope.

Those checks in their current form should not be there, since they
prevent execution of queries with duplicate column aliases, which is
legal in SQL. But even with these checks present, they are not
sufficient to ensure CREATE TABLE/VIEW is well structured. Table or
view columns need to have unique names and there is no qualification
involved.

This commit adds necessary checks in CREATE TABLE/VIEW DDL structs,
ensuring that CREATE TABLE/VIEW logical plans are valid in that regard.
  • Loading branch information
findepi committed Nov 22, 2024
1 parent bd289a5 commit 7dafc6f
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 2 deletions.
15 changes: 15 additions & 0 deletions datafusion/common/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,13 @@ pub enum SchemaError {
qualifier: Box<TableReference>,
name: String,
},
/// Schema duplicate qualified fields with duplicate unqualified names. This is an error
/// for schema in CREATE TABLE/VIEW statements, because the final object is going to retain
/// and return the unqualified names only.
QualifiedFieldWithDuplicateName {
qualifier: Box<TableReference>,
name: String,
},
/// Schema contains duplicate unqualified field name
DuplicateUnqualifiedField { name: String },
/// No field with this name
Expand Down Expand Up @@ -188,6 +195,14 @@ impl Display for SchemaError {
quote_identifier(name)
)
}
Self::QualifiedFieldWithDuplicateName { qualifier, name } => {
write!(
f,
"Schema contains qualified fields with duplicate unqualified names {}.{}",
qualifier.to_quoted_string(),
quote_identifier(name)
)
}
Self::DuplicateUnqualifiedField { name } => {
write!(
f,
Expand Down
135 changes: 133 additions & 2 deletions datafusion/expr/src/logical_plan/ddl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::{Expr, LogicalPlan, SortExpr, Volatility};
use std::cmp::Ordering;
use std::collections::HashMap;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::sync::Arc;
use std::{
fmt::{self, Display},
Expand All @@ -28,7 +28,8 @@ use crate::expr::Sort;
use arrow::datatypes::DataType;
use datafusion_common::tree_node::{Transformed, TreeNodeContainer, TreeNodeRecursion};
use datafusion_common::{
Constraints, DFSchemaRef, Result, SchemaReference, TableReference,
schema_err, Column, Constraints, DFSchema, DFSchemaRef, Result,
SchemaError, SchemaReference, TableReference,
};
use sqlparser::ast::Ident;

Expand Down Expand Up @@ -306,6 +307,7 @@ impl CreateExternalTable {
constraints,
column_defaults,
} = fields;
check_fields_unique(&schema)?;
Ok(Self {
name,
schema,
Expand Down Expand Up @@ -544,6 +546,7 @@ impl CreateMemoryTable {
column_defaults,
temporary,
} = fields;
check_fields_unique(input.schema())?;
Ok(Self {
name,
constraints,
Expand Down Expand Up @@ -698,6 +701,7 @@ impl CreateView {
definition,
temporary,
} = fields;
check_fields_unique(input.schema())?;
Ok(Self {
name,
input,
Expand Down Expand Up @@ -800,6 +804,48 @@ impl CreateViewBuilder {
})
}
}
fn check_fields_unique(schema: &DFSchema) -> Result<()> {
// Use tree set for deterministic error messages
let mut qualified_names = BTreeSet::new();
let mut unqualified_names = HashSet::new();
let mut name_occurrences: HashMap<&String, usize> = HashMap::new();

for (qualifier, field) in schema.iter() {
if let Some(qualifier) = qualifier {
// Check for duplicate qualified field names
if !qualified_names.insert((qualifier, field.name())) {
return schema_err!(SchemaError::DuplicateQualifiedField {
qualifier: Box::new(qualifier.clone()),
name: field.name().to_string(),
});
}
// Check for duplicate unqualified field names
} else if !unqualified_names.insert(field.name()) {
return schema_err!(SchemaError::DuplicateUnqualifiedField {
name: field.name().to_string()
});
}
*name_occurrences.entry(field.name()).or_default() += 1;
}

for (qualifier, name) in qualified_names {
// Check for duplicate between qualified and unqualified field names
if unqualified_names.contains(name) {
return schema_err!(SchemaError::AmbiguousReference {
field: Column::new(Some(qualifier.clone()), name)
});
}
// Check for duplicates between qualified names as the qualification will be stripped off
if name_occurrences[name] > 1 {
return schema_err!(SchemaError::QualifiedFieldWithDuplicateName {
qualifier: Box::new(qualifier.clone()),
name: name.to_owned(),
});
}
}

Ok(())
}

/// Creates a catalog (aka "Database").
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -1085,7 +1131,9 @@ impl PartialOrd for CreateIndex {

#[cfg(test)]
mod test {
use super::*;
use crate::{CreateCatalog, DdlStatement, DropView};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{DFSchema, DFSchemaRef, TableReference};
use std::cmp::Ordering;

Expand All @@ -1112,4 +1160,87 @@ mod test {

assert_eq!(drop_view.partial_cmp(&catalog), Some(Ordering::Greater));
}

#[test]
fn test_check_fields_unique() -> Result<()> {
// no duplicate fields, unqualified schema
check_fields_unique(&DFSchema::try_from(Schema::new(vec![
Field::new("c100", DataType::Boolean, true),
Field::new("c101", DataType::Boolean, true),
]))?)?;

// no duplicate fields, qualified schema
check_fields_unique(&DFSchema::try_from_qualified_schema(
"t1",
&Schema::new(vec![
Field::new("c100", DataType::Boolean, true),
Field::new("c101", DataType::Boolean, true),
]),
)?)?;

// duplicate unqualified field name
assert_eq!(
check_fields_unique(&DFSchema::try_from(Schema::new(vec![
Field::new("c0", DataType::Boolean, true),
Field::new("c1", DataType::Boolean, true),
Field::new("c1", DataType::Boolean, true),
Field::new("c2", DataType::Boolean, true),
]))?)
.unwrap_err()
.strip_backtrace()
.to_string(),
"Schema error: Schema contains duplicate unqualified field name c1"
);

// duplicate qualified field with same qualifier
assert_eq!(
DFSchema::try_from_qualified_schema(
"t1",
&Schema::new(vec![
Field::new("c1", DataType::Boolean, true),
Field::new("c1", DataType::Boolean, true),
])
)
// if schema construction succeeds (due to future changes in DFSchema), call check_fields_unique on it
.unwrap_err()
.strip_backtrace()
.to_string(),
"Schema error: Schema contains duplicate qualified field name t1.c1"
);

// duplicate qualified and unqualified field
assert_eq!(
DFSchema::from_field_specific_qualified_schema(
vec![
None,
Some(TableReference::from("t1")),
],
&Arc::new(Schema::new(vec![
Field::new("c1", DataType::Boolean, true),
Field::new("c1", DataType::Boolean, true),
]))
)
// if schema construction succeeds (due to future changes in DFSchema), call check_fields_unique on it
.unwrap_err().strip_backtrace().to_string(),
"Schema error: Schema contains qualified field name t1.c1 and unqualified field name c1 which would be ambiguous"
);

// qualified fields with duplicate unqualified names
assert_eq!(
check_fields_unique(&DFSchema::from_field_specific_qualified_schema(
vec![
Some(TableReference::from("t1")),
Some(TableReference::from("t2")),
],
&Arc::new(Schema::new(vec![
Field::new("c1", DataType::Boolean, true),
Field::new("c1", DataType::Boolean, true),
]))
)?)
.unwrap_err().strip_backtrace().to_string(),
"Schema error: Schema contains qualified fields with duplicate unqualified names t1.c1"
);

Ok(())
}
}
20 changes: 20 additions & 0 deletions datafusion/sqllogictest/test_files/create_table.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Issue https://github.com/apache/datafusion/issues/13487
statement error DataFusion error: Schema error: Schema contains qualified fields with duplicate unqualified names l\.id
CREATE TABLE t AS SELECT * FROM (SELECT 1 AS id, 'Foo' AS name) l JOIN (SELECT 1 AS id, 'Bar' as name) r ON l.id = r.id;

0 comments on commit 7dafc6f

Please sign in to comment.