Skip to content

Commit e3f8e37

Browse files
hareshkhxanderbaileyxbaileyalamb
authored
[branch-50] Fix ambiguous column names in substrait conversion #17299 (#18077)
## Which issue does this PR close? - Related to #18072 - Related to #17294 - Backports #17299 to branch-50 ## Rationale for this change Fix ambiguous column names in substrait conversion as a result of literals before and after a join being assigned the same name. More information in the issue, but say you have a NULL literal before a join called "column1" and then you create a new NULL column after the join called column2, you would get an ambiguous column name errors like. ``` Error: SchemaError(AmbiguousReference { field: Column { relation: Some(Bare { table: "left" }), name: "UTF8(NULL)" } }, Some("")) ``` ## What changes are included in this PR? Simply alias all literals as they're converted to have a UUID name. ## Are these changes tested? Yes. Tested by using substrait-java with this query ``` ./isthmus-cli/build/graal/isthmus --create "create table A (a int); create table B (a int, c int); create table C (a int, d int)" "select t.*, C.d, CAST(NULL AS VARCHAR) as e from (select a, CAST(NULL AS VARCHAR) as c from A UNION ALL select a, c from B) t LEFT JOIN C ON t.a = C.a" ``` ## Are there any user-facing changes? Co-authored-by: Xander <zander181@googlemail.com> Co-authored-by: Xander Bailey <xbailey@palantir.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 290e3bb commit e3f8e37

File tree

6 files changed

+368
-17
lines changed

6 files changed

+368
-17
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/substrait/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ prost = { workspace = true }
4242
substrait = { version = "0.58", features = ["serde"] }
4343
url = { workspace = true }
4444
tokio = { workspace = true, features = ["fs"] }
45+
uuid = { version = "1.17.0", features = ["v4"] }
4546

4647
[dev-dependencies]
4748
datafusion = { workspace = true, features = ["nested_expressions"] }

datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,20 @@ pub async fn from_project_rel(
6262
// to transform it into a column reference
6363
window_exprs.insert(e.clone());
6464
}
65-
explicit_exprs.push(name_tracker.get_uniquely_named_expr(e)?);
65+
// Substrait plans are ordinal based, so they do not provide names for columns.
66+
// Names for columns are generated by Datafusion during conversion, and for literals
67+
// Datafusion produces names based on the literal value. It is possible to construct
68+
// valid Substrait plans that result in duplicated names if the same literal value is
69+
// used in multiple relations. To avoid this issue, we alias literals with unique names.
70+
// The name tracker will ensure that two literals in the same project would have
71+
// unique names but, it does not ensure that if a literal column exists in a previous
72+
// project say before a join that it is deduplicated with respect to those columns.
73+
// See: https://github.com/apache/datafusion/pull/17299
74+
let maybe_apply_alias = match e {
75+
lit @ Expr::Literal(_, _) => lit.alias(uuid::Uuid::new_v4().to_string()),
76+
_ => e,
77+
};
78+
explicit_exprs.push(name_tracker.get_uniquely_named_expr(maybe_apply_alias)?);
6679
}
6780

6881
let input = if !window_exprs.is_empty() {

datafusion/substrait/tests/cases/consumer_integration.rs

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -651,23 +651,31 @@ mod tests {
651651
#[tokio::test]
652652
async fn test_multiple_unions() -> Result<()> {
653653
let plan_str = test_plan_to_string("multiple_unions.json").await?;
654-
assert_snapshot!(
655-
plan_str,
656-
@r#"
657-
Projection: Utf8("people") AS product_category, Utf8("people")__temp__0 AS product_type, product_key
658-
Union
659-
Projection: Utf8("people"), Utf8("people") AS Utf8("people")__temp__0, sales.product_key
660-
Left Join: sales.product_key = food.@food_id
661-
TableScan: sales
662-
TableScan: food
663-
Union
664-
Projection: people.$f3, people.$f5, people.product_key0
665-
Left Join: people.product_key0 = food.@food_id
666-
TableScan: people
667-
TableScan: food
668-
TableScan: more_products
669-
"#
654+
655+
let mut settings = insta::Settings::clone_current();
656+
settings.add_filter(
657+
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
658+
"[UUID]",
659+
);
660+
settings.bind(|| {
661+
assert_snapshot!(
662+
plan_str,
663+
@r#"
664+
Projection: [UUID] AS product_category, [UUID] AS product_type, product_key
665+
Union
666+
Projection: Utf8("people") AS [UUID], Utf8("people") AS [UUID], sales.product_key
667+
Left Join: sales.product_key = food.@food_id
668+
TableScan: sales
669+
TableScan: food
670+
Union
671+
Projection: people.$f3, people.$f5, people.product_key0
672+
Left Join: people.product_key0 = food.@food_id
673+
TableScan: people
674+
TableScan: food
675+
TableScan: more_products
676+
"#
670677
);
678+
});
671679

672680
Ok(())
673681
}

datafusion/substrait/tests/cases/logical_plans.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,47 @@ mod tests {
144144
Ok(())
145145
}
146146

147+
#[tokio::test]
148+
async fn null_literal_before_and_after_joins() -> Result<()> {
149+
// Confirms that literals used before and after a join but for different columns
150+
// are correctly handled.
151+
152+
// File generated with substrait-java's Isthmus:
153+
// ./isthmus-cli/build/graal/isthmus --create "create table A (a int); create table B (a int, c int); create table C (a int, d int)" "select t.*, C.d, CAST(NULL AS VARCHAR) as e from (select a, CAST(NULL AS VARCHAR) as c from A UNION ALL select a, c from B) t LEFT JOIN C ON t.a = C.a"
154+
let proto_plan =
155+
read_json("tests/testdata/test_plans/disambiguate_literals_with_same_name.substrait.json");
156+
let ctx = add_plan_schemas_to_ctx(SessionContext::new(), &proto_plan)?;
157+
let plan = from_substrait_plan(&ctx.state(), &proto_plan).await?;
158+
159+
let mut settings = insta::Settings::clone_current();
160+
settings.add_filter(
161+
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
162+
"[UUID]",
163+
);
164+
settings.bind(|| {
165+
assert_snapshot!(
166+
plan,
167+
@r#"
168+
Projection: left.A, left.[UUID] AS C, right.D, Utf8(NULL) AS [UUID] AS E
169+
Left Join: left.A = right.A
170+
SubqueryAlias: left
171+
Union
172+
Projection: A.A, Utf8(NULL) AS [UUID]
173+
TableScan: A
174+
Projection: B.A, CAST(B.C AS Utf8)
175+
TableScan: B
176+
SubqueryAlias: right
177+
TableScan: C
178+
"#
179+
);
180+
});
181+
182+
// Trigger execution to ensure plan validity
183+
DataFrame::new(ctx.state(), plan).show().await?;
184+
185+
Ok(())
186+
}
187+
147188
#[tokio::test]
148189
async fn non_nullable_lists() -> Result<()> {
149190
// DataFusion's Substrait consumer treats all lists as nullable, even if the Substrait plan specifies them as non-nullable.

0 commit comments

Comments
 (0)