From c344359df44167abbb1528ee20ff12e730896452 Mon Sep 17 00:00:00 2001
From: Brandon Bradley <bradleytastic@gmail.com>
Date: Tue, 26 Jan 2016 18:47:14 -0600
Subject: [PATCH 1/3] [SPARK-12749][SQL] add json option to parse
 floating-point types as DecimalType

---
 python/pyspark/sql/readwriter.py              |  2 ++
 .../apache/spark/sql/DataFrameReader.scala    |  3 ++
 .../datasources/json/InferSchema.scala        |  9 ++++--
 .../datasources/json/JSONOptions.scala        |  2 ++
 .../datasources/json/JsonSuite.scala          | 28 +++++++++++++++++++
 5 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 0b20022b14b8d..b1453c637f79e 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -152,6 +152,8 @@ def json(self, path, schema=None):
         You can set the following JSON-specific options to deal with non-standard JSON files:
             * ``primitivesAsString`` (default ``false``): infers all primitive values as a string \
                 type
+            * `floatAsBigDecimal` (default `false`): infers all floating-point values as a decimal \
+                type
             * ``allowComments`` (default ``false``): ignores Java/C++ style comment in JSON records
             * ``allowUnquotedFieldNames`` (default ``false``): allows unquoted JSON field names
             * ``allowSingleQuotes`` (default ``true``): allows single quotes in addition to double \
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 634c1bd4739b1..718e87125eea4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -232,6 +232,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
    *
    * You can set the following JSON-specific options to deal with non-standard JSON files:
    * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
+
    * <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
    * <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
    * <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes
@@ -252,6 +253,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
    *
    * You can set the following JSON-specific options to deal with non-standard JSON files:
    * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
+   * <li>`floatAsBigDecimal` (default `false`): infers all floating-point values as a decimal
+   * type</li>
    * <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
    * <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
    * <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
index 44d5e4ff7ec8b..8f6c9661cffe5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.execution.datasources.json
 
 import com.fasterxml.jackson.core._
+import com.fasterxml.jackson.databind.{MappingJsonFactory, DeserializationFeature, ObjectMapper}
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion
@@ -134,8 +135,12 @@ private[json] object InferSchema {
             val v = parser.getDecimalValue
             DecimalType(v.precision(), v.scale())
           case FLOAT | DOUBLE =>
-            // TODO(davies): Should we use decimal if possible?
-            DoubleType
+            if (configOptions.floatAsBigDecimal) {
+              val v = parser.getDecimalValue
+              DecimalType(v.precision(), v.scale())
+            } else {
+              DoubleType
+            }
         }
 
       case VALUE_TRUE | VALUE_FALSE => BooleanType
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
index fe5b20697e40e..31a95ed461215 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala
@@ -34,6 +34,8 @@ private[sql] class JSONOptions(
     parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0)
   val primitivesAsString =
     parameters.get("primitivesAsString").map(_.toBoolean).getOrElse(false)
+  val floatAsBigDecimal =
+    parameters.get("floatAsBigDecimal").map(_.toBoolean).getOrElse(false)
   val allowComments =
     parameters.get("allowComments").map(_.toBoolean).getOrElse(false)
   val allowUnquotedFieldNames =
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 00eaeb0d34e87..dd83a0e36f6f7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -771,6 +771,34 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     )
   }
 
+  test("Loading a JSON dataset floatAsBigDecimal returns schema with float types as BigDecimal") {
+    val jsonDF = sqlContext.read.option("floatAsBigDecimal", "true").json(primitiveFieldAndType)
+
+    val expectedSchema = StructType(
+      StructField("bigInteger", DecimalType(20, 0), true) ::
+        StructField("boolean", BooleanType, true) ::
+        StructField("double", DecimalType(17, -292), true) ::
+        StructField("integer", LongType, true) ::
+        StructField("long", LongType, true) ::
+        StructField("null", StringType, true) ::
+        StructField("string", StringType, true) :: Nil)
+
+    assert(expectedSchema === jsonDF.schema)
+
+    jsonDF.registerTempTable("jsonTable")
+
+    checkAnswer(
+      sql("select * from jsonTable"),
+      Row(BigDecimal("92233720368547758070"),
+        true,
+        BigDecimal("1.7976931348623157E308"),
+        10,
+        21474836470L,
+        null,
+        "this is a simple string.")
+    )
+  }
+
   test("Loading a JSON dataset from a text file with SQL") {
     val dir = Utils.createTempDir()
     dir.delete()

From 430833ec501fe1798c0e9fb0c3af82114b1b701c Mon Sep 17 00:00:00 2001
From: Brandon Bradley <bradleytastic@gmail.com>
Date: Thu, 28 Jan 2016 12:20:59 -0600
Subject: [PATCH 2/3] fix unused import

---
 .../spark/sql/execution/datasources/json/InferSchema.scala       | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
index 8f6c9661cffe5..8b773ddfcb656 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.execution.datasources.json
 
 import com.fasterxml.jackson.core._
-import com.fasterxml.jackson.databind.{MappingJsonFactory, DeserializationFeature, ObjectMapper}
 
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion

From f141097d8f2a307950eec12481d30c12928be855 Mon Sep 17 00:00:00 2001
From: Brandon Bradley <bradleytastic@gmail.com>
Date: Thu, 28 Jan 2016 12:55:11 -0600
Subject: [PATCH 3/3] whitespace fix

---
 .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 718e87125eea4..2e0c6c7df967e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -232,7 +232,6 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
    *
    * You can set the following JSON-specific options to deal with non-standard JSON files:
    * <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
-
    * <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
    * <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
    * <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes