From c344359df44167abbb1528ee20ff12e730896452 Mon Sep 17 00:00:00 2001 From: Brandon Bradley Date: Tue, 26 Jan 2016 18:47:14 -0600 Subject: [PATCH 1/3] [SPARK-12749][SQL] add json option to parse floating-point types as DecimalType --- python/pyspark/sql/readwriter.py | 2 ++ .../apache/spark/sql/DataFrameReader.scala | 3 ++ .../datasources/json/InferSchema.scala | 9 ++++-- .../datasources/json/JSONOptions.scala | 2 ++ .../datasources/json/JsonSuite.scala | 28 +++++++++++++++++++ 5 files changed, 42 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 0b20022b14b8d..b1453c637f79e 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -152,6 +152,8 @@ def json(self, path, schema=None): You can set the following JSON-specific options to deal with non-standard JSON files: * ``primitivesAsString`` (default ``false``): infers all primitive values as a string \ type + * `floatAsBigDecimal` (default `false`): infers all floating-point values as a decimal \ + type * ``allowComments`` (default ``false``): ignores Java/C++ style comment in JSON records * ``allowUnquotedFieldNames`` (default ``false``): allows unquoted JSON field names * ``allowSingleQuotes`` (default ``true``): allows single quotes in addition to double \ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 634c1bd4739b1..718e87125eea4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -232,6 +232,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * * You can set the following JSON-specific options to deal with non-standard JSON files: *
  • `primitivesAsString` (default `false`): infers all primitive values as a string type
  • + *
  • `allowComments` (default `false`): ignores Java/C++ style comment in JSON records
  • *
  • `allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names
  • *
  • `allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes @@ -252,6 +253,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * * You can set the following JSON-specific options to deal with non-standard JSON files: *
  • `primitivesAsString` (default `false`): infers all primitive values as a string type
  • + *
  • `floatAsBigDecimal` (default `false`): infers all floating-point values as a decimal + * type
  • *
  • `allowComments` (default `false`): ignores Java/C++ style comment in JSON records
  • *
  • `allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names
  • *
  • `allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index 44d5e4ff7ec8b..8f6c9661cffe5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.json import com.fasterxml.jackson.core._ +import com.fasterxml.jackson.databind.{MappingJsonFactory, DeserializationFeature, ObjectMapper} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion @@ -134,8 +135,12 @@ private[json] object InferSchema { val v = parser.getDecimalValue DecimalType(v.precision(), v.scale()) case FLOAT | DOUBLE => - // TODO(davies): Should we use decimal if possible? - DoubleType + if (configOptions.floatAsBigDecimal) { + val v = parser.getDecimalValue + DecimalType(v.precision(), v.scale()) + } else { + DoubleType + } } case VALUE_TRUE | VALUE_FALSE => BooleanType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala index fe5b20697e40e..31a95ed461215 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala @@ -34,6 +34,8 @@ private[sql] class JSONOptions( parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) val primitivesAsString = parameters.get("primitivesAsString").map(_.toBoolean).getOrElse(false) + val floatAsBigDecimal = + parameters.get("floatAsBigDecimal").map(_.toBoolean).getOrElse(false) val allowComments = parameters.get("allowComments").map(_.toBoolean).getOrElse(false) val allowUnquotedFieldNames = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 00eaeb0d34e87..dd83a0e36f6f7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -771,6 +771,34 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { ) } + test("Loading a JSON dataset floatAsBigDecimal returns schema with float types as BigDecimal") { + val jsonDF = sqlContext.read.option("floatAsBigDecimal", "true").json(primitiveFieldAndType) + + val expectedSchema = StructType( + StructField("bigInteger", DecimalType(20, 0), true) :: + StructField("boolean", BooleanType, true) :: + StructField("double", DecimalType(17, -292), true) :: + StructField("integer", LongType, true) :: + StructField("long", LongType, true) :: + StructField("null", StringType, true) :: + StructField("string", StringType, true) :: Nil) + + assert(expectedSchema === jsonDF.schema) + + jsonDF.registerTempTable("jsonTable") + + checkAnswer( + sql("select * from jsonTable"), + Row(BigDecimal("92233720368547758070"), + true, + BigDecimal("1.7976931348623157E308"), + 10, + 21474836470L, + null, + "this is a simple string.") + ) + } + test("Loading a JSON dataset from a text file with SQL") { val dir = Utils.createTempDir() dir.delete() From 430833ec501fe1798c0e9fb0c3af82114b1b701c Mon Sep 17 00:00:00 2001 From: Brandon Bradley Date: Thu, 28 Jan 2016 12:20:59 -0600 Subject: [PATCH 2/3] fix unused import --- .../spark/sql/execution/datasources/json/InferSchema.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index 8f6c9661cffe5..8b773ddfcb656 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.datasources.json import com.fasterxml.jackson.core._ -import com.fasterxml.jackson.databind.{MappingJsonFactory, DeserializationFeature, ObjectMapper} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion From f141097d8f2a307950eec12481d30c12928be855 Mon Sep 17 00:00:00 2001 From: Brandon Bradley Date: Thu, 28 Jan 2016 12:55:11 -0600 Subject: [PATCH 3/3] whitespace fix --- .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 718e87125eea4..2e0c6c7df967e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -232,7 +232,6 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * * You can set the following JSON-specific options to deal with non-standard JSON files: *
  • `primitivesAsString` (default `false`): infers all primitive values as a string type
  • - *
  • `allowComments` (default `false`): ignores Java/C++ style comment in JSON records
  • *
  • `allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names
  • *
  • `allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes