-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-43333][SQL] Allow Avro to convert union type to SQL with field name stable with type #41263
Changes from all commits
c4137f9
c22dc11
fa0a86c
c4c87a3
5ec1ace
09123bd
2f9ae46
ff507fd
75f3f6a
da017a9
4232d36
030054c
d3c10b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,10 @@ | |
|
||
package org.apache.spark.sql.avro | ||
|
||
import java.util.Locale | ||
|
||
import scala.collection.JavaConverters._ | ||
import scala.collection.mutable | ||
|
||
import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder} | ||
import org.apache.avro.LogicalTypes.{Date, Decimal, LocalTimestampMicros, LocalTimestampMillis, TimestampMicros, TimestampMillis} | ||
|
@@ -49,13 +52,19 @@ object SchemaConverters { | |
* @since 2.4.0 | ||
*/ | ||
def toSqlType(avroSchema: Schema): SchemaType = { | ||
toSqlTypeHelper(avroSchema, Set.empty) | ||
toSqlTypeHelper(avroSchema, Set.empty, AvroOptions(Map())) | ||
} | ||
def toSqlType(avroSchema: Schema, options: Map[String, String]): SchemaType = { | ||
toSqlTypeHelper(avroSchema, Set.empty, AvroOptions(options)) | ||
} | ||
|
||
// The property specifies Catalyst type of the given field | ||
private val CATALYST_TYPE_PROP_NAME = "spark.sql.catalyst.type" | ||
|
||
private def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = { | ||
private def toSqlTypeHelper( | ||
avroSchema: Schema, | ||
existingRecordNames: Set[String], | ||
avroOptions: AvroOptions): SchemaType = { | ||
avroSchema.getType match { | ||
case INT => avroSchema.getLogicalType match { | ||
case _: Date => SchemaType(DateType, nullable = false) | ||
|
@@ -106,20 +115,23 @@ object SchemaConverters { | |
} | ||
val newRecordNames = existingRecordNames + avroSchema.getFullName | ||
val fields = avroSchema.getFields.asScala.map { f => | ||
val schemaType = toSqlTypeHelper(f.schema(), newRecordNames) | ||
val schemaType = toSqlTypeHelper(f.schema(), newRecordNames, avroOptions) | ||
StructField(f.name, schemaType.dataType, schemaType.nullable) | ||
} | ||
|
||
SchemaType(StructType(fields.toArray), nullable = false) | ||
|
||
case ARRAY => | ||
val schemaType = toSqlTypeHelper(avroSchema.getElementType, existingRecordNames) | ||
val schemaType = toSqlTypeHelper( | ||
avroSchema.getElementType, | ||
existingRecordNames, | ||
avroOptions) | ||
SchemaType( | ||
ArrayType(schemaType.dataType, containsNull = schemaType.nullable), | ||
nullable = false) | ||
|
||
case MAP => | ||
val schemaType = toSqlTypeHelper(avroSchema.getValueType, existingRecordNames) | ||
val schemaType = toSqlTypeHelper(avroSchema.getValueType, existingRecordNames, avroOptions) | ||
SchemaType( | ||
MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable), | ||
nullable = false) | ||
|
@@ -129,26 +141,50 @@ object SchemaConverters { | |
// In case of a union with null, eliminate it and make a recursive call | ||
val remainingUnionTypes = AvroUtils.nonNullUnionBranches(avroSchema) | ||
if (remainingUnionTypes.size == 1) { | ||
toSqlTypeHelper(remainingUnionTypes.head, existingRecordNames).copy(nullable = true) | ||
} else { | ||
toSqlTypeHelper(Schema.createUnion(remainingUnionTypes.asJava), existingRecordNames) | ||
toSqlTypeHelper(remainingUnionTypes.head, existingRecordNames, avroOptions) | ||
.copy(nullable = true) | ||
} else { | ||
toSqlTypeHelper( | ||
Schema.createUnion(remainingUnionTypes.asJava), | ||
existingRecordNames, | ||
avroOptions).copy(nullable = true) | ||
} | ||
} else avroSchema.getTypes.asScala.map(_.getType).toSeq match { | ||
case Seq(t1) => | ||
toSqlTypeHelper(avroSchema.getTypes.get(0), existingRecordNames) | ||
toSqlTypeHelper(avroSchema.getTypes.get(0), existingRecordNames, avroOptions) | ||
case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => | ||
SchemaType(LongType, nullable = false) | ||
case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => | ||
SchemaType(DoubleType, nullable = false) | ||
case _ => | ||
// Convert complex unions to struct types where field names are member0, member1, etc. | ||
// This is consistent with the behavior when converting between Avro and Parquet. | ||
// When avroOptions.useStableIdForUnionType is false, convert complex unions to struct | ||
// types where field names are member0, member1, etc. This is consistent with the | ||
// behavior when converting between Avro and Parquet. | ||
Comment on lines +161 to +162
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is the Parquet connection here? Should this say "consistent with default behavior before adding support for stable names"? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the existing comment. It just got moved to different lines after adding "When avroOptions.useStableIdForUnionType is false" at the beginning. I don't know what it refers to, and I have no reason to believe it is wrong. |
||
// If avroOptions.useStableIdForUnionType is true, include type name in field names | ||
// so that users can drop or add fields and keep field name stable. | ||
val fieldNameSet : mutable.Set[String] = mutable.Set() | ||
val fields = avroSchema.getTypes.asScala.zipWithIndex.map { | ||
case (s, i) => | ||
val schemaType = toSqlTypeHelper(s, existingRecordNames) | ||
val schemaType = toSqlTypeHelper(s, existingRecordNames, avroOptions) | ||
|
||
val fieldName = if (avroOptions.useStableIdForUnionType) { | ||
// Avro's field name may be case sensitive, so field names for two named type | ||
// could be "a" and "A" and we need to distinguish them. In this case, we throw | ||
// an exception. | ||
val temp_name = s"member_${s.getName.toLowerCase(Locale.ROOT)}" | ||
if (fieldNameSet.contains(temp_name)) { | ||
throw new IncompatibleSchemaException( | ||
"Cannot generate stable indentifier for Avro union type due to name " + | ||
s"conflict of type name ${s.getName}") | ||
} | ||
fieldNameSet.add(temp_name) | ||
temp_name | ||
} else { | ||
s"member$i" | ||
} | ||
|
||
// All fields are nullable because only one of them is set at a time | ||
StructField(s"member$i", schemaType.dataType, nullable = true) | ||
StructField(fieldName, schemaType.dataType, nullable = true) | ||
} | ||
|
||
SchemaType(StructType(fields.toArray), nullable = false) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we add documentation for this? I think the Spark conf version had a long doc comment. We can reuse that here.