[SPARK-18246][SQL] Throws an exception before execution for unsupported types in Json, CSV and text functionalities #15751
```diff
@@ -143,6 +143,15 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
     val broadcastedHadoopConf =
       sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))

+    if (csvOptions.failFast) {
+      // We can fail before starting to parse in "FAILFAST" mode. In "PERMISSIVE" mode,
+      // unsupported types are read as null. In "DROPMALFORMED" mode, only records containing
+      // non-null values of unsupported types are dropped. We should use `requiredSchema`
+      // instead of the whole schema `dataSchema` here so as not to break the original
+      // behaviour.
+      verifySchema(requiredSchema)
+    }
+
     (file: PartitionedFile) => {
       val lineIterator = {
         val conf = broadcastedHadoopConf.value.value
```
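In effect, a CSV query that requires an unsupported column under `FAILFAST` now fails up front rather than mid-scan. A minimal sketch of what this looks like from the user side (the session setup and input path below are hypothetical, for illustration only):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

// Hypothetical session and input path, for illustration only.
val spark = SparkSession.builder().master("local[*]").appName("failfast-check").getOrCreate()

// ArrayType is not in verifyType's supported list.
val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("tags", ArrayType(StringType))
))

val df = spark.read
  .schema(schema)
  .option("mode", "FAILFAST")
  .csv("/tmp/people.csv")

// Selecting the unsupported column makes it part of `requiredSchema`, so this
// should throw UnsupportedOperationException before any rows are parsed:
//   "CSV data source does not support array<string> data type."
df.select("tags").count()

// Selecting only supported columns prunes `tags` out of `requiredSchema`,
// so this still runs as before.
df.select("id").count()
```

This is why the guard checks `requiredSchema` rather than `dataSchema`: column pruning keeps queries that never touch the unsupported column working unchanged.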
```diff
@@ -223,15 +232,15 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
   private def verifySchema(schema: StructType): Unit = {
     def verifyType(dataType: DataType): Unit = dataType match {
       case ByteType | ShortType | IntegerType | LongType | FloatType |
            DoubleType | BooleanType | _: DecimalType | TimestampType |
            DateType | StringType =>

       case udt: UserDefinedType[_] => verifyType(udt.sqlType)

       case _ =>
         throw new UnsupportedOperationException(
           s"CSV data source does not support ${dataType.simpleString} data type.")
     }

     schema.foreach(field => verifyType(field.dataType))
   }
```
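The check walks only the top-level fields and recurses through UDTs to the SQL type they are stored as. A self-contained sketch of the same matching logic (assuming only Spark's `types` package on the classpath), with example calls showing which types pass:

```scala
import org.apache.spark.sql.types._

// Self-contained copy of the matching logic above, to show which types pass.
def verifyType(dataType: DataType): Unit = dataType match {
  // Atomic types the CSV source can round-trip: accepted, no-op.
  case ByteType | ShortType | IntegerType | LongType | FloatType |
       DoubleType | BooleanType | _: DecimalType | TimestampType |
       DateType | StringType =>

  // A UDT is judged by the SQL type it is stored as.
  case udt: UserDefinedType[_] => verifyType(udt.sqlType)

  // Everything else (arrays, maps, structs, ...) is rejected eagerly.
  case other =>
    throw new UnsupportedOperationException(
      s"CSV data source does not support ${other.simpleString} data type.")
}

verifyType(DecimalType(10, 2))                  // passes silently
verifyType(StringType)                          // passes silently
// verifyType(MapType(StringType, IntegerType)) // would throw: map<string,int>
```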
In this case, we don't have to worry about the parsing mode, because `from_json` produces `null` with the default parse mode, `FAILFAST`.
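As a sketch of the behaviour the comment refers to (the column name and inline data are made up; `spark` is assumed to be an existing SparkSession):

```scala
import org.apache.spark.sql.functions.from_json
import org.apache.spark.sql.types._

// Assumes an existing SparkSession named `spark`.
import spark.implicits._

val schema = StructType(Seq(StructField("a", IntegerType)))

val df = Seq("""{"a": 1}""", """not json""").toDF("value")

// The first row parses to a struct; the malformed second row comes back as
// null instead of failing the query.
df.select(from_json($"value", schema).as("parsed")).show()
```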