[SPARK-11745][SQL] Enable more JSON parsing options #9724
Changes from all commits

SparkPlan.scala

@@ -221,22 +221,6 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ

   private[this] def isTesting: Boolean = sys.props.contains("spark.testing")

-  protected def newProjection(
-      expressions: Seq[Expression], inputSchema: Seq[Attribute]): Projection = {
-    log.debug(s"Creating Projection: $expressions, inputSchema: $inputSchema")
-    try {
-      GenerateProjection.generate(expressions, inputSchema)
-    } catch {
-      case e: Exception =>
-        if (isTesting) {
-          throw e
-        } else {
-          log.error("Failed to generate projection, fallback to interpret", e)
-          new InterpretedProjection(expressions, inputSchema)
-        }
-    }
-  }
-
   protected def newMutableProjection(
Contributor (Author):

> this is now unused.
       expressions: Seq[Expression], inputSchema: Seq[Attribute]): () => MutableProjection = {
     log.debug(s"Creating MutableProj: $expressions, inputSchema: $inputSchema")
@@ -282,6 +266,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ
       }
     }
   }
+
   /**
    * Creates a row ordering for the given schema, in natural ascending order.
    */

InferSchema.scala

@@ -25,33 +25,36 @@ import org.apache.spark.sql.execution.datasources.json.JacksonUtils.nextUntil
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils

-private[sql] object InferSchema {
+private[json] object InferSchema {

   /**
    * Infer the type of a collection of json records in three stages:
    *   1. Infer the type of each record
    *   2. Merge types by choosing the lowest type necessary to cover equal keys
    *   3. Replace any remaining null fields with string, the top type
    */
-  def apply(
+  def infer(
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it was super confusing for apply to return RDD (i.e. it is not a factory method). |
||
       json: RDD[String],
-      samplingRatio: Double = 1.0,
       columnNameOfCorruptRecords: String,
-      primitivesAsString: Boolean = false): StructType = {
-    require(samplingRatio > 0, s"samplingRatio ($samplingRatio) should be greater than 0")
-    val schemaData = if (samplingRatio > 0.99) {
+      configOptions: JSONOptions): StructType = {
+    require(configOptions.samplingRatio > 0,
+      s"samplingRatio (${configOptions.samplingRatio}) should be greater than 0")
+    val schemaData = if (configOptions.samplingRatio > 0.99) {
       json
     } else {
-      json.sample(withReplacement = false, samplingRatio, 1)
+      json.sample(withReplacement = false, configOptions.samplingRatio, 1)
     }

     // perform schema inference on each row and merge afterwards
     val rootType = schemaData.mapPartitions { iter =>
       val factory = new JsonFactory()
+      configOptions.setJacksonOptions(factory)
       iter.map { row =>
         try {
           Utils.tryWithResource(factory.createParser(row)) { parser =>
             parser.nextToken()
-            inferField(parser, primitivesAsString)
+            inferField(parser, configOptions)
           }
         } catch {
           case _: JsonParseException =>
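For context, a hedged sketch (not part of this diff) of how the new entry point might be invoked once all parsing options travel through a single JSONOptions value. The `sqlContext`, the input path, and the corrupt-record column name are illustrative assumptions; since InferSchema is now `private[json]`, a call like this would live inside the json data source package.

```scala
// Illustrative only: assumes an existing SQLContext named `sqlContext`.
import org.apache.spark.rdd.RDD

val options = JSONOptions(samplingRatio = 0.5, primitivesAsString = true)
val lines: RDD[String] = sqlContext.sparkContext.textFile("/tmp/records.json")
val schema = InferSchema.infer(lines, columnNameOfCorruptRecords = "_corrupt_record", options)
```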
@@ -71,14 +74,14 @@ private[sql] object InferSchema
   /**
    * Infer the type of a json document from the parser's token stream
    */
-  private def inferField(parser: JsonParser, primitivesAsString: Boolean): DataType = {
+  private def inferField(parser: JsonParser, configOptions: JSONOptions): DataType = {
     import com.fasterxml.jackson.core.JsonToken._
     parser.getCurrentToken match {
       case null | VALUE_NULL => NullType

       case FIELD_NAME =>
         parser.nextToken()
-        inferField(parser, primitivesAsString)
+        inferField(parser, configOptions)

       case VALUE_STRING if parser.getTextLength < 1 =>
         // Zero length strings and nulls have special handling to deal

@@ -95,7 +98,7 @@
         while (nextUntil(parser, END_OBJECT)) {
           builder += StructField(
             parser.getCurrentName,
-            inferField(parser, primitivesAsString),
+            inferField(parser, configOptions),
             nullable = true)
         }

@@ -107,14 +110,15 @@
         // the type as we pass through all JSON objects.
         var elementType: DataType = NullType
         while (nextUntil(parser, END_ARRAY)) {
-          elementType = compatibleType(elementType, inferField(parser, primitivesAsString))
+          elementType = compatibleType(
+            elementType, inferField(parser, configOptions))
         }

         ArrayType(elementType)

-      case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT) if primitivesAsString => StringType
+      case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT) if configOptions.primitivesAsString => StringType

-      case (VALUE_TRUE | VALUE_FALSE) if primitivesAsString => StringType
+      case (VALUE_TRUE | VALUE_FALSE) if configOptions.primitivesAsString => StringType

       case VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT =>
         import JsonParser.NumberType._

@@ -178,7 +182,7 @@
   /**
    * Returns the most general data type for two given data types.
    */
-  private[json] def compatibleType(t1: DataType, t2: DataType): DataType = {
+  def compatibleType(t1: DataType, t2: DataType): DataType = {
     HiveTypeCoercion.findTightestCommonTypeOfTwo(t1, t2).getOrElse {
       // t1 or t2 is a StructType, ArrayType, or an unexpected type.
       (t1, t2) match {
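A hedged sketch of the merging behaviour the scaladoc above describes (not part of this diff). The expected results follow from findTightestCommonTypeOfTwo plus the StringType fallback, so treat them as assumptions rather than documented guarantees.

```scala
import org.apache.spark.sql.types._

// Numeric types widen to the tightest common type...
InferSchema.compatibleType(IntegerType, LongType)   // expected: LongType
// ...NullType gives way to the other side...
InferSchema.compatibleType(NullType, DoubleType)    // expected: DoubleType
// ...and otherwise inference falls back to StringType, the "top type".
InferSchema.compatibleType(BooleanType, LongType)   // expected: StringType
```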

JSONOptions.scala (new file)

@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.json
+
+import com.fasterxml.jackson.core.{JsonParser, JsonFactory}
+
+/**
+ * Options for the JSON data source.
+ *
+ * Most of these map directly to Jackson's internal options, specified in [[JsonParser.Feature]].
+ */
+case class JSONOptions(
+    samplingRatio: Double = 1.0,
+    primitivesAsString: Boolean = false,
+    allowComments: Boolean = false,
+    allowUnquotedFieldNames: Boolean = false,
+    allowSingleQuotes: Boolean = true,
+    allowNumericLeadingZeros: Boolean = false,
+    allowNonNumericNumbers: Boolean = false) {
Contributor (Author):

> allowNonNumericNumbers is undocumented for now, since I can't figure out how it works.
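As a hedged aside (not part of this diff or the comment above): Jackson documents ALLOW_NON_NUMERIC_NUMBERS as accepting tokens such as NaN and Infinity as floating-point values, which a small standalone check can illustrate.

```scala
import com.fasterxml.jackson.core.{JsonFactory, JsonParser}

val factory = new JsonFactory()
factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, true)

// With the feature enabled the parser accepts the bare NaN token;
// with it disabled, nextToken() throws a JsonParseException on it.
val parser = factory.createParser("""{"ratio": NaN}""")
while (parser.nextToken() != null) {}
parser.close()
```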
+
+  /** Sets config options on a Jackson [[JsonFactory]]. */
+  def setJacksonOptions(factory: JsonFactory): Unit = {
+    factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments)
+    factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, allowUnquotedFieldNames)
+    factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes)
+    factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros)
+    factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
+  }
+}
+
+object JSONOptions {
+  def createFromConfigMap(parameters: Map[String, String]): JSONOptions = JSONOptions(
+    samplingRatio =
+      parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0),
+    primitivesAsString =
+      parameters.get("primitivesAsString").map(_.toBoolean).getOrElse(false),
+    allowComments =
+      parameters.get("allowComments").map(_.toBoolean).getOrElse(false),
+    allowUnquotedFieldNames =
+      parameters.get("allowUnquotedFieldNames").map(_.toBoolean).getOrElse(false),
+    allowSingleQuotes =
+      parameters.get("allowSingleQuotes").map(_.toBoolean).getOrElse(true),
+    allowNumericLeadingZeros =
+      parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false),
+    allowNonNumericNumbers =
+      parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true)
+  )
+}
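A hedged usage sketch (not part of this diff): building a JSONOptions from the same string-keyed map a data source would receive, then applying it to a Jackson factory via setJacksonOptions. The option values here are arbitrary illustrations.

```scala
import com.fasterxml.jackson.core.JsonFactory

// Keys mirror the ones read by createFromConfigMap above.
val options = JSONOptions.createFromConfigMap(Map(
  "samplingRatio" -> "0.8",
  "allowComments" -> "true",
  "allowUnquotedFieldNames" -> "true",
  "allowSingleQuotes" -> "false"))

val factory = new JsonFactory()
options.setJacksonOptions(factory)
// factory now accepts comments and unquoted field names, but rejects single-quoted strings.
```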
Comment: Add samplingRatio?

Reply: I think we skipped it in the past because it had very little impact on performance, so in most cases it is better to just use 1.0... Maybe we should even deprecate that option.
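To close the loop on what this PR enables end to end, a hedged sketch of the user-facing side (not part of this diff): the new keys are intended to be passed as JSON data source options, reaching JSONOptions.createFromConfigMap through the reader's parameter map. The `sqlContext` and path are illustrative assumptions.

```scala
// Assumes a Spark 1.6-era SQLContext named `sqlContext` and a JSON file at the given path.
val df = sqlContext.read
  .option("primitivesAsString", "true")        // infer all primitives as strings
  .option("allowComments", "true")             // tolerate /* */ and // comments in records
  .option("allowNumericLeadingZeros", "true")
  .option("samplingRatio", "0.5")              // sample half of the input for schema inference
  .json("/tmp/records.json")

df.printSchema()
```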