-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24709][SQL] schema_of_json() - schema inference from an example #21686
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
891f3ce
26f3275
1848a7a
42da3f2
97d93b3
ab82bd8
174f8ab
d77ed45
086f6c1
2ff71e8
064bc5c
56c925d
c993fd1
86f6886
dc35731
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.expressions | ||
|
|
||
| import java.io.{ByteArrayInputStream, ByteArrayOutputStream, CharArrayWriter, InputStreamReader, StringWriter} | ||
| import java.io._ | ||
|
|
||
| import scala.util.parsing.combinator.RegexParsers | ||
|
|
||
|
|
@@ -28,7 +28,8 @@ import org.apache.spark.sql.catalyst.InternalRow | |
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback | ||
| import org.apache.spark.sql.catalyst.json._ | ||
| import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, FailFastMode, GenericArrayData, MapData} | ||
| import org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField | ||
| import org.apache.spark.sql.catalyst.util._ | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
@@ -525,17 +526,19 @@ case class JsonToStructs( | |
| override def nullable: Boolean = true | ||
|
|
||
| // Used in `FunctionRegistry` | ||
| def this(child: Expression, schema: Expression) = | ||
| def this(child: Expression, schema: Expression, options: Map[String, String]) = | ||
| this( | ||
| schema = JsonExprUtils.validateSchemaLiteral(schema), | ||
| options = Map.empty[String, String], | ||
| schema = JsonExprUtils.evalSchemaExpr(schema), | ||
| options = options, | ||
| child = child, | ||
| timeZoneId = None, | ||
| forceNullableSchema = SQLConf.get.getConf(SQLConf.FROM_JSON_FORCE_NULLABLE_SCHEMA)) | ||
|
|
||
| def this(child: Expression, schema: Expression) = this(child, schema, Map.empty[String, String]) | ||
|
|
||
| def this(child: Expression, schema: Expression, options: Expression) = | ||
| this( | ||
| schema = JsonExprUtils.validateSchemaLiteral(schema), | ||
| schema = JsonExprUtils.evalSchemaExpr(schema), | ||
| options = JsonExprUtils.convertToMapData(options), | ||
| child = child, | ||
| timeZoneId = None, | ||
|
|
@@ -744,11 +747,44 @@ case class StructsToJson( | |
| override def inputTypes: Seq[AbstractDataType] = TypeCollection(ArrayType, StructType) :: Nil | ||
| } | ||
|
|
||
| /** | ||
| * A function that infers the schema of a JSON string. | ||
| */ | ||
| @ExpressionDescription( | ||
| usage = "_FUNC_(json[, options]) - Returns schema in the DDL format of JSON string.", | ||
| examples = """ | ||
| Examples: | ||
| > SELECT _FUNC_('[{"col":0}]'); | ||
| array<struct<col:int>> | ||
| """, | ||
| since = "2.4.0") | ||
| case class SchemaOfJson(child: Expression) | ||
| extends UnaryExpression with String2StringExpression with CodegenFallback { | ||
|
|
||
| private val jsonOptions = new JSONOptions(Map.empty, "UTC") | ||
| private val jsonFactory = new JsonFactory() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for pointing this out. I really didn't know that I had to call the method. |
||
| jsonOptions.setJacksonOptions(jsonFactory) | ||
|
|
||
| override def convert(v: UTF8String): UTF8String = { | ||
| val dt = Utils.tryWithResource(CreateJacksonParser.utf8String(jsonFactory, v)) { parser => | ||
| parser.nextToken() | ||
| inferField(parser, jsonOptions) | ||
| } | ||
|
|
||
| UTF8String.fromString(dt.catalogString) | ||
| } | ||
| } | ||
|
|
||
| object JsonExprUtils { | ||
|
|
||
| def validateSchemaLiteral(exp: Expression): DataType = exp match { | ||
| def evalSchemaExpr(exp: Expression): DataType = exp match { | ||
| case Literal(s, StringType) => DataType.fromDDL(s.toString) | ||
| case e => throw new AnalysisException(s"Expected a string literal instead of $e") | ||
| case e @ SchemaOfJson(_: Literal) => | ||
| val ddlSchema = e.eval().asInstanceOf[UTF8String] | ||
| DataType.fromDDL(ddlSchema.toString) | ||
| case e => throw new AnalysisException( | ||
| "Schema should be specified in DDL format as a string literal" + | ||
| s" or output of the schema_of_json function instead of ${e.sql}") | ||
| } | ||
|
|
||
| def convertToMapData(exp: Expression): Map[String, String] = exp match { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3381,6 +3381,48 @@ object functions { | |
| from_json(e, dataType, options) | ||
| } | ||
|
|
||
| /** | ||
| * (Scala-specific) Parses a column containing a JSON string into a `MapType` with `StringType` | ||
| * as the key type, `StructType` or `ArrayType` of `StructType`s with the specified schema. | ||
| * Returns `null`, in the case of an unparseable string. | ||
| * | ||
| * @param e a string column containing JSON data. | ||
| * @param schema the schema to use when parsing the json string | ||
| * | ||
| * @group collection_funcs | ||
| * @since 2.4.0 | ||
| */ | ||
| def from_json(e: Column, schema: Column): Column = { | ||
| from_json(e, schema, Map.empty[String, String].asJava) | ||
| } | ||
|
|
||
| /** | ||
| * (Java-specific) Parses a column containing a JSON string into a `MapType` with `StringType` | ||
| * as the key type, `StructType` or `ArrayType` of `StructType`s with the specified schema. | ||
| * Returns `null`, in the case of an unparseable string. | ||
| * | ||
| * @param e a string column containing JSON data. | ||
| * @param schema the schema to use when parsing the json string | ||
| * @param options options to control how the json is parsed. Accepts the same options as the | ||
| * json data source. | ||
| * | ||
| * @group collection_funcs | ||
| * @since 2.4.0 | ||
| */ | ||
| def from_json(e: Column, schema: Column, options: java.util.Map[String, String]): Column = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let me leave my last comment, #21686 (comment), here in case it was missed.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We call this method from Python: https://github.com/apache/spark/pull/21686/files#diff-f5295f69bfbdbf6e161aed54057ea36dR2202 Do you really want to revert the changes for Python?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, I see. I am fine then. Thanks. |
||
| withExpr(new JsonToStructs(e.expr, schema.expr, options.asScala.toMap)) | ||
| } | ||
|
|
||
| /** | ||
| * Parses a column containing a JSON string and infers its schema. | ||
| * | ||
| * @param e a string column containing JSON data. | ||
| * | ||
| * @group collection_funcs | ||
| * @since 2.4.0 | ||
| */ | ||
| def schema_of_json(e: Column): Column = withExpr(new SchemaOfJson(e.expr)) | ||
|
|
||
| /** | ||
| * (Scala-specific) Converts a column containing a `StructType`, `ArrayType` of `StructType`s, | ||
| * a `MapType` or `ArrayType` of `MapType`s into a JSON string with the specified schema. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
'''{"a": 0}''' -> '{"a": 0}'
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Feel free to fix the other examples above too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean for other functions too?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nope, I mean the examples here in this function.