From 5654c9d5289300620e9674ce1147c115526bd314 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Thu, 29 Jan 2015 17:41:18 -0800 Subject: [PATCH 01/14] Draft version of Parquet partition discovery and schema merging --- .../apache/spark/deploy/SparkHadoopUtil.scala | 17 +- .../spark/sql/parquet/ParquetTypes.scala | 64 ++- .../apache/spark/sql/parquet/newParquet.scala | 430 ++++++++++++------ 3 files changed, 373 insertions(+), 138 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index d68854214ef0..03238e9fa008 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -21,7 +21,7 @@ import java.lang.reflect.Method import java.security.PrivilegedExceptionAction import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.fs.FileSystem.Statistics import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} @@ -191,6 +191,21 @@ class SparkHadoopUtil extends Logging { val method = context.getClass.getMethod("getConfiguration") method.invoke(context).asInstanceOf[Configuration] } + + /** + * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the + * given path points to a file, return a single-element collection containing [[FileStatus]] of + * that file. + */ + def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = { + def recurse(path: Path) = { + val (directories, leaves) = fs.listStatus(path).partition(_.isDir) + leaves ++ directories.flatMap(f => listLeafStatuses(fs, f.getPath)) + } + + val baseStatus = fs.getFileStatus(basePath) + if (baseStatus.isDir) recurse(basePath) else Array(baseStatus) + } } object SparkHadoopUtil { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala index b646109b7c55..73e43611707c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala @@ -19,24 +19,23 @@ package org.apache.spark.sql.parquet import java.io.IOException +import scala.collection.mutable.ArrayBuffer import scala.util.Try import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.mapreduce.Job - import parquet.format.converter.ParquetMetadataConverter -import parquet.hadoop.{ParquetFileReader, Footer, ParquetFileWriter} -import parquet.hadoop.metadata.{ParquetMetadata, FileMetaData} +import parquet.hadoop.metadata.{FileMetaData, ParquetMetadata} import parquet.hadoop.util.ContextUtil -import parquet.schema.{Type => ParquetType, Types => ParquetTypes, PrimitiveType => ParquetPrimitiveType, MessageType} -import parquet.schema.{GroupType => ParquetGroupType, OriginalType => ParquetOriginalType, ConversionPatterns, DecimalMetadata} +import parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} import parquet.schema.PrimitiveType.{PrimitiveTypeName => ParquetPrimitiveTypeName} import parquet.schema.Type.Repetition +import parquet.schema.{ConversionPatterns, DecimalMetadata, GroupType => ParquetGroupType, MessageType, OriginalType => ParquetOriginalType, PrimitiveType => ParquetPrimitiveType, Type => ParquetType, Types => 
ParquetTypes} -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Attribute} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types._ +import org.apache.spark.{Logging, SparkException} // Implicits import scala.collection.JavaConversions._ @@ -523,4 +522,55 @@ private[parquet] object ParquetTypesConverter extends Logging { attributes } } + + def mergeCatalystSchemas(left: StructType, right: StructType): StructType = + mergeCatalystDataTypes(left, right).asInstanceOf[StructType] + + def mergeCatalystDataTypes(left: DataType, right: DataType): DataType = + (left, right) match { + case (ArrayType(leftElementType, leftContainsNull), + ArrayType(rightElementType, rightContainsNull)) => + ArrayType( + mergeCatalystDataTypes(leftElementType, rightElementType), + leftContainsNull || rightContainsNull) + + case (MapType(leftKeyType, leftValueType, leftContainsNull), + MapType(rightKeyType, rightValueType, rightContainsNull)) => + MapType( + mergeCatalystDataTypes(leftKeyType, rightKeyType), + mergeCatalystDataTypes(leftValueType, rightValueType), + leftContainsNull || rightContainsNull) + + case (StructType(leftFields), StructType(rightFields)) => + val newFields = ArrayBuffer.empty[StructField] + + leftFields.foreach { + case leftField @ StructField(leftName, leftType, leftNullable, leftMetadata) => + rightFields + .find(_.name == leftName) + .map { case rightField @ StructField(_, rightType, rightNullable, rightMeatadata) => + leftField.copy( + dataType = mergeCatalystDataTypes(leftType, rightType), + nullable = leftNullable || rightNullable) + } + .orElse(Some(leftField)) + .foreach(newFields += _) + } + + rightFields + .filterNot(f => leftFields.map(_.name).contains(f.name)) + .foreach(newFields += _) + + StructType(newFields) + + case (DecimalType.Fixed(leftPrecision, leftScale), + DecimalType.Fixed(rightPrecision, rightScale)) => + DecimalType(leftPrecision.max(rightPrecision), leftScale.max(rightScale)) + + case (leftType, rightType) if leftType == rightType => + leftType + + case _ => + throw new SparkException(s"Failed to merge incompatible data types $left and $right") + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 179c0d6b2223..25a659fe9ccb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -16,25 +16,33 @@ */ package org.apache.spark.sql.parquet +import java.lang.{Double => JDouble, Float => JFloat, Long => JLong} +import java.math.{BigDecimal => JBigDecimal} +import java.net.URI import java.util.{List => JList} import scala.collection.JavaConversions._ +import scala.util.Try -import org.apache.hadoop.conf.{Configurable, Configuration} +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.Writable +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.{InputSplit, Job, JobContext} import parquet.filter2.predicate.FilterApi -import parquet.hadoop.ParquetInputFormat +import parquet.format.converter.ParquetMetadataConverter +import parquet.hadoop.{ParquetInputFormat, _} import parquet.hadoop.util.ContextUtil import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.deploy.SparkHadoopUtil import 
org.apache.spark.rdd.{NewHadoopPartition, RDD} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.types.{IntegerType, StructField, StructType, _} import org.apache.spark.sql.{Row, SQLConf, SQLContext} -import org.apache.spark.{Logging, Partition => SparkPartition} +import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} /** @@ -48,14 +56,16 @@ class DefaultSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { - val path = - parameters.getOrElse("path", sys.error("'path' must be specified for parquet tables.")) + val path = parameters.getOrElse("path", + sys.error("'path' must be specified for parquet tables.")) - ParquetRelation2(path)(sqlContext) + ParquetRelation2(path, parameters)(sqlContext) } } -private[parquet] case class Partition(partitionValues: Map[String, Any], files: Seq[FileStatus]) +private[parquet] case class Partition(values: Row, path: String) + +private[parquet] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition]) /** * An alternative to [[ParquetRelation]] that plugs in using the data sources API. This class is @@ -81,117 +91,169 @@ private[parquet] case class Partition(partitionValues: Map[String, Any], files: * discovery. */ @DeveloperApi -case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) +case class ParquetRelation2 + (path: String, parameters: Map[String, String]) + (@transient val sqlContext: SQLContext) extends CatalystScan with Logging { + // Should we merge schemas from all Parquet part-files? + private val shouldMergeSchemas = + parameters.getOrElse("parquet.schema.merge", "true").toBoolean + def sparkContext = sqlContext.sparkContext - // Minor Hack: scala doesnt seem to respect @transient for vals declared via extraction - @transient - private var partitionKeys: Seq[String] = _ - @transient - private var partitions: Seq[Partition] = _ - discoverPartitions() + private val fs = FileSystem.get(new URI(path), sparkContext.hadoopConfiguration) - // TODO: Only finds the first partition, assumes the key is of type Integer... - private def discoverPartitions() = { - val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) - val partValue = "([^=]+)=([^=]+)".r + private val qualifiedBasePath = fs.makeQualified(new Path(path)) - val childrenOfPath = fs.listStatus(new Path(path)).filterNot(_.getPath.getName.startsWith("_")) - val childDirs = childrenOfPath.filter(s => s.isDir) + // Cache `FileStatus` objects for Parquet data files, "_metadata", and "_common_metadata". 
+ private val (dataFiles, metadataFile, commonMetadataFile) = { + val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, qualifiedBasePath).filter { f => + isSummaryFile(f.getPath) || + (!f.getPath.getName.startsWith("_") && !f.getPath.getName.startsWith(".")) + } - if (childDirs.size > 0) { - val partitionPairs = childDirs.map(_.getPath.getName).map { - case partValue(key, value) => (key, value) - } + assert(leaves.nonEmpty, s"$qualifiedBasePath is either an empty folder or nonexistent.") - val foundKeys = partitionPairs.map(_._1).distinct - if (foundKeys.size > 1) { - sys.error(s"Too many distinct partition keys: $foundKeys") - } + (leaves.filterNot(f => isSummaryFile(f.getPath)), + leaves.find(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE), + leaves.find(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)) + } - // Do a parallel lookup of partition metadata. - val partitionFiles = - childDirs.par.map { d => - fs.listStatus(d.getPath) - // TODO: Is there a standard hadoop function for this? - .filterNot(_.getPath.getName.startsWith("_")) - .filterNot(_.getPath.getName.startsWith(".")) - }.seq - - partitionKeys = foundKeys.toSeq - partitions = partitionFiles.zip(partitionPairs).map { case (files, (key, value)) => - Partition(Map(key -> value.toInt), files) - }.toSeq + private val PartitionSpec(partitionColumns, partitions) = { + val partitionDirPaths = dataFiles + .map(f => fs.makeQualified(f.getPath.getParent)) + .filterNot(_ == qualifiedBasePath) + .distinct + + if (partitionDirPaths.nonEmpty) { + ParquetRelation2.parsePartitions(qualifiedBasePath, partitionDirPaths) } else { - partitionKeys = Nil - partitions = Partition(Map.empty, childrenOfPath) :: Nil + // No partition directories found, makes a pseudo single-partition specification + PartitionSpec( + StructType(Seq.empty[StructField]), + Seq(Partition(EmptyRow, qualifiedBasePath.toString))) } } - override val sizeInBytes = partitions.flatMap(_.files).map(_.getLen).sum + private def isPartitioned = partitionColumns.nonEmpty - val dataSchema = StructType.fromAttributes( // TODO: Parquet code should not deal with attributes. - ParquetTypesConverter.readSchemaFromFile( - partitions.head.files.head.getPath, - Some(sparkContext.hadoopConfiguration), - sqlContext.conf.isParquetBinaryAsString, - sqlContext.conf.isParquetINT96AsTimestamp)) + private val footers = { + // TODO Issue a Spark job to gather footers if there are too many files + (dataFiles ++ metadataFile ++ commonMetadataFile).par.map { f => + val parquetMetadata = ParquetFileReader.readFooter( + sparkContext.hadoopConfiguration, f, ParquetMetadataConverter.NO_FILTER) + f -> new Footer(f.getPath, parquetMetadata) + }.seq.toMap + } - val dataIncludesKey = - partitionKeys.headOption.map(dataSchema.fieldNames.contains(_)).getOrElse(true) + private def readSchema(): StructType = { + // Figures out which file(s) we need to touch in order to retrieve the schema. + val filesToTouch = + // Always tries the summary files first if users don't require a merged schema. In this case, + // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row + // groups information, and could be much smaller for large Parquet files with lots of row + // groups. + // + // NOTE: Metadata stored in the summary files are merged from all part-files. 
However, for + // user defined key-value metadata (in which we store Spark SQL schema), Parquet doesn't know + // how to merge them correctly if some key is associated with different values in different + // part-files. When this happens, Parquet simply gives up generating the summary file. This + // implies that if a summary file presents, then: + // + // 1. Either all part-files have exactly the same Spark SQL schema, or + // 2. Some part-files don't contain Spark SQL schema in the key-value metadata at all (thus + // their schemas may differ from each other). + // + // Here we tend to be pessimistic and take the second case into account. Basically this means + // we can't trust the summary files if users require a merged schema, and must touch all part- + // files to do the merge. + if (shouldMergeSchemas) { + dataFiles + } else { + commonMetadataFile + .orElse(metadataFile) + // Summary file(s) not found, falls back to the first part-file. + .orElse(dataFiles.headOption).toSeq + } - override val schema = - if (dataIncludesKey) { - dataSchema - } else { - StructType(dataSchema.fields :+ StructField(partitionKeys.head, IntegerType)) + filesToTouch.map { file => + val metadata = footers(file).getParquetMetadata.getFileMetaData + val parquetSchema = metadata.getSchema + val maybeSparkSchema = metadata + .getKeyValueMetaData + .toMap + .get(RowReadSupport.SPARK_METADATA_KEY) + .map(DataType.fromJson(_).asInstanceOf[StructType]) + + maybeSparkSchema.getOrElse { + // Falls back to Parquet schema if Spark SQL schema is absent. + StructType.fromAttributes( + // TODO Really no need to use `Attribute` here, we only need to know the data type. + convertToAttributes(parquetSchema, sqlContext.conf.isParquetBinaryAsString)) + } + }.reduce { (left, right) => + try mergeCatalystSchemas(left, right) catch { case e: Throwable => + throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e) + } } + } - override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { - // This is mostly a hack so that we can use the existing parquet filter code. - val requiredColumns = output.map(_.name) + private def isSummaryFile(file: Path): Boolean = { + file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE || + file.getName == ParquetFileWriter.PARQUET_METADATA_FILE + } + + // TODO Should calculate per scan size + // It's common that a query only scans a fraction of a large Parquet file. Returning size of the + // whole Parquet file disables some optimizations in this case (e.g. broadcast join). + override val sizeInBytes = partitions.map { part => + dataFiles.find(_.getPath.getParent.toString == part.path).get.getLen + }.sum + + private val dataSchema = readSchema() + + private val dataSchemaIncludesPartitionKeys = + partitionColumns.forall(f => dataSchema.fieldNames.contains(f.name)) + override val schema = if (dataSchemaIncludesPartitionKeys) { + dataSchema + } else { + StructType(dataSchema.fields ++ partitionColumns.fields) + } + + // This is mostly a hack so that we can use the existing parquet filter code. 
+ override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { val job = new Job(sparkContext.hadoopConfiguration) ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) val jobConf: Configuration = ContextUtil.getConfiguration(job) - val requestedSchema = StructType(requiredColumns.map(schema(_))) - - val partitionKeySet = partitionKeys.toSet - val rawPredicate = + val partitionKeySet = partitionColumns.map(_.name).toSet + val partitionPruningPredicate = predicates .filter(_.references.map(_.name).toSet.subsetOf(partitionKeySet)) .reduceOption(And) .getOrElse(Literal(true)) - // Translate the predicate so that it reads from the information derived from the - // folder structure - val castedPredicate = rawPredicate transform { + val pruningCondition = InterpretedPredicate(partitionPruningPredicate transform { case a: AttributeReference => - val idx = partitionKeys.indexWhere(a.name == _) - BoundReference(idx, IntegerType, nullable = true) - } + val idx = partitionColumns.indexWhere(a.name == _.name) + BoundReference(idx, partitionColumns(idx).dataType, nullable = true) + }) - val inputData = new GenericMutableRow(partitionKeys.size) - val pruningCondition = InterpretedPredicate(castedPredicate) + val selectedPartitions = if (isPartitioned && predicates.nonEmpty) { + partitions.filter(p => pruningCondition(p.values)) + } else { + partitions + } - val selectedPartitions = - if (partitionKeys.nonEmpty && predicates.nonEmpty) { - partitions.filter { part => - inputData(0) = part.partitionValues.values.head - pruningCondition(inputData) - } - } else { - partitions - } + val selectedFiles = selectedPartitions.flatMap { p => + dataFiles.filter(_.getPath.getParent.toString == p.path) + } - val fs = FileSystem.get(new java.net.URI(path), sparkContext.hadoopConfiguration) - val selectedFiles = selectedPartitions.flatMap(_.files).map(f => fs.makeQualified(f.getPath)) // FileInputFormat cannot handle empty lists. if (selectedFiles.nonEmpty) { - org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(job, selectedFiles: _*) + FileInputFormat.setInputPaths(job, selectedFiles.map(_.getPath): _*) } // Push down filters when possible. Notice that not all filters can be converted to Parquet @@ -206,13 +268,16 @@ case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 logInfo(s"Reading $percentRead% of $path partitions") + val requiredColumns = output.map(_.name) + val requestedSchema = StructType(requiredColumns.map(schema(_))) + // Store both requested and original schema in `Configuration` jobConf.set( RowReadSupport.SPARK_ROW_REQUESTED_SCHEMA, - ParquetTypesConverter.convertToString(requestedSchema.toAttributes)) + convertToString(requestedSchema.toAttributes)) jobConf.set( RowWriteSupport.SPARK_ROW_SCHEMA, - ParquetTypesConverter.convertToString(schema.toAttributes)) + convertToString(schema.toAttributes)) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata val useCache = sqlContext.getConf(SQLConf.PARQUET_CACHE_METADATA, "true").toBoolean @@ -228,62 +293,51 @@ case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) val cacheMetadata = useCache @transient - val cachedStatus = selectedPartitions.flatMap(_.files) + val cachedStatus = selectedFiles // Overridden so we can inject our own cached files statuses. 
override def getPartitions: Array[SparkPartition] = { - val inputFormat = - if (cacheMetadata) { - new FilteringParquetRowInputFormat { - override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatus - } - } else { - new FilteringParquetRowInputFormat + val inputFormat = if (cacheMetadata) { + new FilteringParquetRowInputFormat { + override def listStatus(jobContext: JobContext): JList[FileStatus] = cachedStatus } - - inputFormat match { - case configurable: Configurable => - configurable.setConf(getConf) - case _ => + } else { + new FilteringParquetRowInputFormat } + val jobContext = newJobContext(getConf, jobId) - val rawSplits = inputFormat.getSplits(jobContext).toArray - val result = new Array[SparkPartition](rawSplits.size) - for (i <- 0 until rawSplits.size) { - result(i) = - new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) + val rawSplits = inputFormat.getSplits(jobContext) + + Array.tabulate[SparkPartition](rawSplits.size) { i => + new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } - result } } - // The ordinal for the partition key in the result row, if requested. - val partitionKeyLocation = - partitionKeys - .headOption - .map(requiredColumns.indexOf(_)) - .getOrElse(-1) + // The ordinals for partition keys in the result row, if requested. + val partitionKeyLocations = partitionColumns.fieldNames.zipWithIndex.map { + case (name, index) => index -> requiredColumns.indexOf(name) + }.toMap.filter { + case (_, index) => index >= 0 + } // When the data does not include the key and the key is requested then we must fill it in // based on information from the input split. - if (!dataIncludesKey && partitionKeyLocation != -1) { - baseRDD.mapPartitionsWithInputSplit { case (split, iter) => - val partValue = "([^=]+)=([^=]+)".r - val partValues = - split.asInstanceOf[parquet.hadoop.ParquetInputSplit] - .getPath - .toString - .split("/") - .flatMap { - case partValue(key, value) => Some(key -> value) - case _ => None - }.toMap - - val currentValue = partValues.values.head.toInt - iter.map { pair => - val res = pair._2.asInstanceOf[SpecificMutableRow] - res.setInt(partitionKeyLocation, currentValue) - res + if (!dataSchemaIncludesPartitionKeys && partitionKeyLocations.nonEmpty) { + baseRDD.mapPartitionsWithInputSplit { case (split: ParquetInputSplit, iterator) => + val partValues = selectedPartitions.collectFirst { + case p if split.getPath.getParent.toString == p.path => p.values + }.get + + iterator.map { pair => + val row = pair._2.asInstanceOf[SpecificMutableRow] + var i = 0 + while (i < partValues.size) { + // TODO Avoids boxing cost here! + row.update(partitionKeyLocations(i), partValues(i)) + i += 1 + } + row } } } else { @@ -291,3 +345,119 @@ case class ParquetRelation2(path: String)(@transient val sqlContext: SQLContext) } } } + +object ParquetRelation2 { + // TODO Data source implementations shouldn't touch Catalyst types (`Literal`). + // However, we are already using Catalyst expressions for partition pruning and predicate + // push-down here... + case class PartitionDesc(columnNames: Seq[String], literals: Seq[Literal]) { + require(columnNames.size == literals.size) + } + + /** + * Given a base path and all data file paths in it, returns a partition specification. 
+ */ + private[parquet] def parsePartitions(basePath: Path, dataPaths: Seq[Path]): PartitionSpec = { + val partitionDescs = resolvePartitions(dataPaths.map(parsePartition(basePath, _))) + val PartitionDesc(columnNames, columnLiterals) = partitionDescs.head + val fields = columnNames.zip(columnLiterals).map { case (name, Literal(_, dataType)) => + StructField(name, dataType, nullable = true) + } + + val partitions = (partitionDescs, dataPaths).zipped.map { (desc, path) => + val values = desc.literals.map(_.value) + Partition(Row(values: _*), path.toString) + } + + PartitionSpec(StructType(fields), partitions) + } + + /** + * Parses a single partition, returns column names and values of each partition column. For + * example, given: + * {{{ + * basePath = hdfs://host:9000/base/path/ + * dataPath = hdfs://host:9000/base/path/a=42/b=hello/c=3.14 + * }}} + * we have: + * {{{ + * PartitionSpec( + * Seq("a", "b", "c"), + * Seq( + * Literal(42, IntegerType), + * Literal("hello", StringType), + * Literal(3.14, FloatType))) + * }}} + */ + private[parquet] def parsePartition(basePath: Path, dataPath: Path): PartitionDesc = { + val rawSpec = dataPath.toString.stripPrefix(basePath.toString).stripPrefix(Path.SEPARATOR) + val (columnNames, values) = rawSpec.split(Path.SEPARATOR).map { column => + val equalSignIndex = column.indexOf('=') + assert(equalSignIndex > 0, s"Invalid partition column spec '$column' found in $dataPath") + val columnName = rawSpec.take(equalSignIndex) + val literal = inferPartitionColumnValue(rawSpec.drop(equalSignIndex + 1)) + columnName -> literal + }.unzip + + PartitionDesc(columnNames, values) + } + + /** + * Resolves possible type conflicts between partitions by up-casting "lower" types. The up- + * casting order is: + * {{{ + * IntegerType -> LongType -> FloatType -> DoubleType -> DecimalType.Unlimited -> StringType + * }}} + */ + private[parquet] def resolvePartitions(descs: Seq[PartitionDesc]): Seq[PartitionDesc] = { + val distinctColNamesOfPartitions = descs.map(_.columnNames).distinct + val columnCount = descs.head.columnNames.size + + // Column names of all partitions must match + assert(distinctColNamesOfPartitions.size == 1, { + val list = distinctColNamesOfPartitions.mkString("\t", "\n", "") + s"Conflicting partition column names detected:\n$list" + }) + + // Resolves possible type conflicts for each column + val resolvedValues = (0 until columnCount).map { i => + resolveTypeConflicts(descs.map(_.literals(i))) + } + + // Fills resolved literals back to each partition + descs.zipWithIndex.map { case (d, index) => + d.copy(literals = resolvedValues.map(_(index))) + } + } + + /** + * Converts a string to a `Literal` with automatic type inference. Currently only supports + * [[IntegerType]], [[LongType]], [[FloatType]], [[DoubleType]], [[DecimalType.Unlimited]], and + * [[StringType]]. 
+ */ + private[parquet] def inferPartitionColumnValue(raw: String): Literal = { + // First tries integral types + Try(Literal(Integer.parseInt(raw), IntegerType)) + .orElse(Try(Literal(JLong.parseLong(raw), LongType))) + // Then falls back to fractional types + .orElse(Try(Literal(JFloat.parseFloat(raw), FloatType))) + .orElse(Try(Literal(JDouble.parseDouble(raw), DoubleType))) + .orElse(Try(Literal(new JBigDecimal(raw), DecimalType.Unlimited))) + // Then falls back to string + .getOrElse(Literal(raw, StringType)) + } + + private val upCastingOrder: Seq[DataType] = + Seq(IntegerType, LongType, FloatType, DoubleType, DecimalType.Unlimited, StringType) + + /** + * Given a collection of [[Literal]]s, resolves possible type conflicts by up-casting "lower" + * types. + */ + private def resolveTypeConflicts(literals: Seq[Literal]): Seq[Literal] = { + val desiredType = literals.map(_.dataType).maxBy(upCastingOrder.indexOf(_)) + literals.map { case l @ Literal(_, dataType) => + Literal(Cast(l, desiredType).eval(), desiredType) + } + } +} From a1896c7e074565eabb1abf79df5802749502179d Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sat, 31 Jan 2015 16:23:27 -0800 Subject: [PATCH 02/14] Fixes all existing Parquet test suites except for ParquetMetastoreSuite --- .../sql/catalyst/expressions/predicates.scala | 7 +- .../scala/org/apache/spark/sql/SQLConf.scala | 5 + .../org/apache/spark/sql/SQLContext.scala | 2 +- .../apache/spark/sql/parquet/newParquet.scala | 25 +++-- .../sql/parquet/ParquetFilterSuite.scala | 104 ++++++++++-------- .../spark/sql/parquet/ParquetQuerySuite.scala | 2 +- 6 files changed, 85 insertions(+), 60 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index c84cc95520a1..365b1685a8e7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.analysis.UnresolvedException import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.types.BooleanType +import org.apache.spark.sql.types.{BinaryType, BooleanType} object InterpretedPredicate { def apply(expression: Expression, inputSchema: Seq[Attribute]): (Row => Boolean) = @@ -175,7 +175,10 @@ case class EqualTo(left: Expression, right: Expression) extends BinaryComparison null } else { val r = right.eval(input) - if (r == null) null else l == r + if (r == null) null + else if (left.dataType != BinaryType) l == r + else BinaryType.ordering.compare( + l.asInstanceOf[Array[Byte]], r.asInstanceOf[Array[Byte]]) == 0 } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 7fe17944a734..0c7624a5d7d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -37,6 +37,7 @@ private[spark] object SQLConf { val PARQUET_CACHE_METADATA = "spark.sql.parquet.cacheMetadata" val PARQUET_COMPRESSION = "spark.sql.parquet.compression.codec" val PARQUET_FILTER_PUSHDOWN_ENABLED = "spark.sql.parquet.filterPushdown" + val PARQUET_USE_DATA_SOURCE_API = "spark.sql.parquet.useDataSourceApi" val COLUMN_NAME_OF_CORRUPT_RECORD = "spark.sql.columnNameOfCorruptRecord" val 
BROADCAST_TIMEOUT = "spark.sql.broadcastTimeout" @@ -105,6 +106,10 @@ private[sql] class SQLConf extends Serializable { private[spark] def parquetFilterPushDown = getConf(PARQUET_FILTER_PUSHDOWN_ENABLED, "false").toBoolean + /** When true uses Parquet implementation based on data source API */ + private[spark] def parquetUseDataSourceApi= + getConf(PARQUET_USE_DATA_SOURCE_API, "true").toBoolean + /** When true the planner will use the external sort, which may spill to disk. */ private[spark] def externalSortEnabled: Boolean = getConf(EXTERNAL_SORT, "false").toBoolean diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 01620aa0acd4..1f88efa5df24 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -304,7 +304,7 @@ class SQLContext(@transient val sparkContext: SparkContext) * @group userf */ def parquetFile(path: String): DataFrame = - DataFrame(this, parquet.ParquetRelation(path, Some(sparkContext.hadoopConfiguration), this)) + baseRelationToDataFrame(parquet.ParquetRelation2(path, Map("path" -> path))(this)) /** * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 25a659fe9ccb..f47a7889cb60 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -122,6 +122,9 @@ case class ParquetRelation2 private val PartitionSpec(partitionColumns, partitions) = { val partitionDirPaths = dataFiles + // When reading a single raw Parquet part-file, base path points to that single data file + // rather than its parent directory, shouldn't use it for partition discovery. + .filterNot(_.getPath == qualifiedBasePath) .map(f => fs.makeQualified(f.getPath.getParent)) .filterNot(_ == qualifiedBasePath) .distinct @@ -129,10 +132,8 @@ case class ParquetRelation2 if (partitionDirPaths.nonEmpty) { ParquetRelation2.parsePartitions(qualifiedBasePath, partitionDirPaths) } else { - // No partition directories found, makes a pseudo single-partition specification - PartitionSpec( - StructType(Seq.empty[StructField]), - Seq(Partition(EmptyRow, qualifiedBasePath.toString))) + // No partition directories found, makes an empty specification + PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[Partition]) } } @@ -207,9 +208,7 @@ case class ParquetRelation2 // TODO Should calculate per scan size // It's common that a query only scans a fraction of a large Parquet file. Returning size of the // whole Parquet file disables some optimizations in this case (e.g. broadcast join). - override val sizeInBytes = partitions.map { part => - dataFiles.find(_.getPath.getParent.toString == part.path).get.getLen - }.sum + override val sizeInBytes = dataFiles.map(_.getLen).sum private val dataSchema = readSchema() @@ -247,8 +246,10 @@ case class ParquetRelation2 partitions } - val selectedFiles = selectedPartitions.flatMap { p => - dataFiles.filter(_.getPath.getParent.toString == p.path) + val selectedFiles = if (isPartitioned) { + selectedPartitions.flatMap(p => dataFiles.filter(_.getPath.getParent.toString == p.path)) + } else { + dataFiles } // FileInputFormat cannot handle empty lists. 
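
A minimal usage sketch of the data-source-based Parquet path and partition discovery wired up above, assuming a hypothetical partitioned directory layout; the paths, column names, and table name below are illustrative and not taken from this patch:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

val sc = new SparkContext(new SparkConf().setAppName("parquet-partition-discovery"))
val sqlContext = new SQLContext(sc)

// Route parquetFile() through ParquetRelation2 (the data source API implementation).
sqlContext.setConf("spark.sql.parquet.useDataSourceApi", "true")

// Hypothetical layout written by an earlier job:
//   /data/events/year=2014/month=12/part-r-00000.parquet
//   /data/events/year=2015/month=01/part-r-00000.parquet
// Partition discovery should surface `year` and `month` as extra columns with inferred types.
val events = sqlContext.parquetFile("/data/events")
events.printSchema()

// Predicates on partition columns can be answered by pruning whole directories
// rather than reading every part-file.
events.registerTempTable("events")
sqlContext.sql("SELECT count(*) FROM events WHERE year = 2015").collect()
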
@@ -265,8 +266,10 @@ case class ParquetRelation2 .filter(_ => sqlContext.conf.parquetFilterPushDown) .foreach(ParquetInputFormat.setFilterPredicate(jobConf, _)) - def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 - logInfo(s"Reading $percentRead% of $path partitions") + if (isPartitioned) { + def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 + logInfo(s"Reading $percentRead% of $path partitions") + } val requiredColumns = output.map(_.name) val requestedSchema = StructType(requiredColumns.map(schema(_))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index ff91a0eb4204..d5363164ebcb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -22,8 +22,10 @@ import parquet.filter2.predicate.{FilterPredicate, Operators} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, Predicate, Row} -import org.apache.spark.sql.types._ +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.sources.LogicalRelation import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, QueryTest, SQLConf} /** @@ -54,9 +56,17 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { .select(output.map(e => Column(e)): _*) .where(Column(predicate)) - val maybeAnalyzedPredicate = query.queryExecution.executedPlan.collect { - case plan: ParquetTableScan => plan.columnPruningPred - }.flatten.reduceOption(_ && _) + val maybeAnalyzedPredicate = { + val forParquetTableScan = query.queryExecution.executedPlan.collect { + case plan: ParquetTableScan => plan.columnPruningPred + }.flatten.reduceOption(_ && _) + + val forParquetDataSource = query.queryExecution.optimizedPlan.collect { + case PhysicalOperation(_, filters, LogicalRelation(_: ParquetRelation2)) => filters + }.flatten.reduceOption(_ && _) + + forParquetTableScan.orElse(forParquetDataSource) + } assert(maybeAnalyzedPredicate.isDefined) maybeAnalyzedPredicate.foreach { pred => @@ -86,35 +96,38 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { test("filter pushdown - boolean") { withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq [_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) - checkFilterPredicate('_1 === true, classOf[Eq [_]], true) + checkFilterPredicate('_1 === true, classOf[Eq[_]], true) checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) } } test("filter pushdown - short") { withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => - checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq [_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt [_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt [_]], 4) + checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) + checkFilterPredicate( + Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + 
+ checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt [_]], 4) - checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) - + + checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) + checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) - checkFilterPredicate(Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, - classOf[Operators.And], 3) - checkFilterPredicate(Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, - classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate( + Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) + checkFilterPredicate( + Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, + classOf[Operators.Or], + Seq(Row(1), Row(4))) } } @@ -131,15 +144,15 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -151,20 +164,20 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) 
< '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -176,8 +189,8 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) @@ -189,7 +202,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -201,20 +214,20 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq [_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt [_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt [_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) } } @@ -227,8 +240,8 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") checkFilterPredicate('_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) - checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") - checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") + checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") + checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") @@ -268,11 +281,12 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { implicit rdd => + checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) checkBinaryFilterPredicate( 
'_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.b)).toSeq) - checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq [_]], 1.b) checkBinaryFilterPredicate( '_1 !== 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 5ec7a156d935..4c9c86fee9d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -34,7 +34,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { } } - test("appending") { + ignore("appending") { val data = (0 until 10).map(i => (i, i.toString)) withParquetTable(data, "t") { sql("INSERT INTO TABLE t SELECT * FROM t") From 7d0f7a22b49e9685d22542d51223246d61e2b21b Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 1 Feb 2015 20:34:09 -0800 Subject: [PATCH 03/14] Fixes Metastore Parquet table conversion --- .../org/apache/spark/sql/SQLContext.scala | 5 +- .../apache/spark/sql/parquet/newParquet.scala | 377 ++++++++++++------ .../apache/spark/sql/hive/HiveContext.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 19 +- .../spark/sql/hive/HiveStrategies.scala | 2 +- .../spark/sql/parquet/parquetSuites.scala | 14 +- 6 files changed, 288 insertions(+), 131 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 1f88efa5df24..ae94bab3aec1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -303,8 +303,9 @@ class SQLContext(@transient val sparkContext: SparkContext) * * @group userf */ - def parquetFile(path: String): DataFrame = - baseRelationToDataFrame(parquet.ParquetRelation2(path, Map("path" -> path))(this)) + @scala.annotation.varargs + def parquetFile(paths: String*): DataFrame = + baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this)) /** * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index f47a7889cb60..b1050b2fbc15 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -18,10 +18,10 @@ package org.apache.spark.sql.parquet import java.lang.{Double => JDouble, Float => JFloat, Long => JLong} import java.math.{BigDecimal => JBigDecimal} -import java.net.URI import java.util.{List => JList} import scala.collection.JavaConversions._ +import scala.collection.mutable.ArrayBuffer import scala.util.Try import org.apache.hadoop.conf.Configuration @@ -36,7 +36,7 @@ import parquet.hadoop.util.ContextUtil import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.rdd.{NewHadoopPartition, RDD} +import org.apache.spark.rdd.{NewHadoopPartition, NewHadoopRDD, RDD} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ @@ -51,7 +51,7 @@ import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} * required is `path`, which should be the location of a collection of, optionally partitioned, * parquet files. */ -class DefaultSource extends RelationProvider { +class DefaultSource extends RelationProvider with SchemaRelationProvider { /** Returns a new base relation with the given parameters. */ override def createRelation( sqlContext: SQLContext, @@ -59,7 +59,17 @@ class DefaultSource extends RelationProvider { val path = parameters.getOrElse("path", sys.error("'path' must be specified for parquet tables.")) - ParquetRelation2(path, parameters)(sqlContext) + ParquetRelation2(Seq(path), parameters, None)(sqlContext) + } + + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String], + schema: StructType): BaseRelation = { + val path = parameters.getOrElse("path", + sys.error("'path' must be specified for parquet tables.")) + + ParquetRelation2(Seq(path), parameters, Some(schema))(sqlContext) } } @@ -92,64 +102,97 @@ private[parquet] case class PartitionSpec(partitionColumns: StructType, partitio */ @DeveloperApi case class ParquetRelation2 - (path: String, parameters: Map[String, String]) + (paths: Seq[String], parameters: Map[String, String], maybeSchema: Option[StructType] = None) (@transient val sqlContext: SQLContext) extends CatalystScan with Logging { // Should we merge schemas from all Parquet part-files? private val shouldMergeSchemas = - parameters.getOrElse("parquet.schema.merge", "true").toBoolean + parameters.getOrElse(ParquetRelation2.MERGE_SCHEMA, "true").toBoolean def sparkContext = sqlContext.sparkContext - private val fs = FileSystem.get(new URI(path), sparkContext.hadoopConfiguration) + private val fs = FileSystem.get(sparkContext.hadoopConfiguration) - private val qualifiedBasePath = fs.makeQualified(new Path(path)) + private val baseStatuses = { + val statuses = paths.distinct.map(p => fs.getFileStatus(fs.makeQualified(new Path(p)))) + assert(statuses.forall(_.isFile) || statuses.forall(_.isDir)) + statuses + } - // Cache `FileStatus` objects for Parquet data files, "_metadata", and "_common_metadata". 
- private val (dataFiles, metadataFile, commonMetadataFile) = { - val leaves = SparkHadoopUtil.get.listLeafStatuses(fs, qualifiedBasePath).filter { f => + private val leafStatuses = baseStatuses.flatMap { f => + val statuses = SparkHadoopUtil.get.listLeafStatuses(fs, f.getPath).filter { f => isSummaryFile(f.getPath) || - (!f.getPath.getName.startsWith("_") && !f.getPath.getName.startsWith(".")) + !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) } + assert(statuses.nonEmpty, s"${f.getPath} is an empty folder.") + statuses + } - assert(leaves.nonEmpty, s"$qualifiedBasePath is either an empty folder or nonexistent.") + private val (dataStatuses, metadataStatuses, commonMetadataStatuses) = { + (leafStatuses.filterNot(f => isSummaryFile(f.getPath)).toSeq, + leafStatuses.filter(f => f.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE), + leafStatuses.filter(f => f.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)) + } - (leaves.filterNot(f => isSummaryFile(f.getPath)), - leaves.find(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE), - leaves.find(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)) + private val footers = { + // TODO Issue a Spark job to gather footers if there are too many files + (dataStatuses ++ metadataStatuses ++ commonMetadataStatuses).par.map { f => + val parquetMetadata = ParquetFileReader.readFooter( + sparkContext.hadoopConfiguration, f, ParquetMetadataConverter.NO_FILTER) + f -> new Footer(f.getPath, parquetMetadata) + }.seq.toMap } - private val PartitionSpec(partitionColumns, partitions) = { - val partitionDirPaths = dataFiles - // When reading a single raw Parquet part-file, base path points to that single data file - // rather than its parent directory, shouldn't use it for partition discovery. 
- .filterNot(_.getPath == qualifiedBasePath) - .map(f => fs.makeQualified(f.getPath.getParent)) - .filterNot(_ == qualifiedBasePath) - .distinct - - if (partitionDirPaths.nonEmpty) { - ParquetRelation2.parsePartitions(qualifiedBasePath, partitionDirPaths) + private val partitionSpec = { + val partitionDirs = + dataStatuses + .filterNot(baseStatuses.contains) + .map(_.getPath.getParent) + .distinct + + // Hive uses this as part of the default partition name when the partition column value is null + // or empty string + val defaultPartitionName = parameters.getOrElse( + ParquetRelation2.DEFAULT_PARTITION_NAME, + "__HIVE_DEFAULT_PARTITION__") + + if (partitionDirs.nonEmpty) { + ParquetRelation2.parsePartitions(partitionDirs, defaultPartitionName) } else { // No partition directories found, makes an empty specification PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[Partition]) } } + private val PartitionSpec(partitionColumns, partitions) = partitionSpec + private def isPartitioned = partitionColumns.nonEmpty - private val footers = { - // TODO Issue a Spark job to gather footers if there are too many files - (dataFiles ++ metadataFile ++ commonMetadataFile).par.map { f => - val parquetMetadata = ParquetFileReader.readFooter( - sparkContext.hadoopConfiguration, f, ParquetMetadataConverter.NO_FILTER) - f -> new Footer(f.getPath, parquetMetadata) - }.seq.toMap + private val dataSchema = maybeSchema.getOrElse(readSchema()) + + private val dataSchemaIncludesPartitionKeys = + isPartitioned && partitionColumns.forall(f => dataSchema.fieldNames.contains(f.name)) + + override val schema = { + val fullParquetSchema = if (dataSchemaIncludesPartitionKeys) { + dataSchema + } else { + StructType(dataSchema.fields ++ partitionColumns.fields) + } + + val maybeMetastoreSchema = + parameters + .get(ParquetRelation2.METASTORE_SCHEMA) + .map(s => DataType.fromJson(s).asInstanceOf[StructType]) + + maybeMetastoreSchema + .map(ParquetRelation2.mergeMetastoreParquetSchema(_, fullParquetSchema)) + .getOrElse(fullParquetSchema) } private def readSchema(): StructType = { - // Figures out which file(s) we need to touch in order to retrieve the schema. + // Sees which file(s) we need to touch in order to figure out the schema. val filesToTouch = // Always tries the summary files first if users don't require a merged schema. In this case, // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row @@ -170,34 +213,16 @@ case class ParquetRelation2 // we can't trust the summary files if users require a merged schema, and must touch all part- // files to do the merge. if (shouldMergeSchemas) { - dataFiles + dataStatuses.toSeq } else { - commonMetadataFile - .orElse(metadataFile) + commonMetadataStatuses.headOption + .orElse(metadataStatuses.headOption) // Summary file(s) not found, falls back to the first part-file. - .orElse(dataFiles.headOption).toSeq + .orElse(dataStatuses.headOption) + .toSeq } - filesToTouch.map { file => - val metadata = footers(file).getParquetMetadata.getFileMetaData - val parquetSchema = metadata.getSchema - val maybeSparkSchema = metadata - .getKeyValueMetaData - .toMap - .get(RowReadSupport.SPARK_METADATA_KEY) - .map(DataType.fromJson(_).asInstanceOf[StructType]) - - maybeSparkSchema.getOrElse { - // Falls back to Parquet schema if Spark SQL schema is absent. - StructType.fromAttributes( - // TODO Really no need to use `Attribute` here, we only need to know the data type. 
- convertToAttributes(parquetSchema, sqlContext.conf.isParquetBinaryAsString)) - } - }.reduce { (left, right) => - try mergeCatalystSchemas(left, right) catch { case e: Throwable => - throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e) - } - } + ParquetRelation2.readSchema(filesToTouch.map(footers.apply), sqlContext) } private def isSummaryFile(file: Path): Boolean = { @@ -208,18 +233,7 @@ case class ParquetRelation2 // TODO Should calculate per scan size // It's common that a query only scans a fraction of a large Parquet file. Returning size of the // whole Parquet file disables some optimizations in this case (e.g. broadcast join). - override val sizeInBytes = dataFiles.map(_.getLen).sum - - private val dataSchema = readSchema() - - private val dataSchemaIncludesPartitionKeys = - partitionColumns.forall(f => dataSchema.fieldNames.contains(f.name)) - - override val schema = if (dataSchemaIncludesPartitionKeys) { - dataSchema - } else { - StructType(dataSchema.fields ++ partitionColumns.fields) - } + override val sizeInBytes = dataStatuses.map(_.getLen).sum // This is mostly a hack so that we can use the existing parquet filter code. override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { @@ -227,29 +241,13 @@ case class ParquetRelation2 ParquetInputFormat.setReadSupportClass(job, classOf[RowReadSupport]) val jobConf: Configuration = ContextUtil.getConfiguration(job) - val partitionKeySet = partitionColumns.map(_.name).toSet - val partitionPruningPredicate = - predicates - .filter(_.references.map(_.name).toSet.subsetOf(partitionKeySet)) - .reduceOption(And) - .getOrElse(Literal(true)) - - val pruningCondition = InterpretedPredicate(partitionPruningPredicate transform { - case a: AttributeReference => - val idx = partitionColumns.indexWhere(a.name == _.name) - BoundReference(idx, partitionColumns(idx).dataType, nullable = true) - }) - - val selectedPartitions = if (isPartitioned && predicates.nonEmpty) { - partitions.filter(p => pruningCondition(p.values)) - } else { - partitions - } - + val selectedPartitions = prunePartitions(predicates, partitions) val selectedFiles = if (isPartitioned) { - selectedPartitions.flatMap(p => dataFiles.filter(_.getPath.getParent.toString == p.path)) + selectedPartitions.flatMap { p => + dataStatuses.filter(_.getPath.getParent.toString == p.path) + } } else { - dataFiles + dataStatuses.toSeq } // FileInputFormat cannot handle empty lists. 
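
To make the merging rules used by `readSchema` concrete, here is a small sketch of how `ParquetTypesConverter.mergeCatalystSchemas` (added in the first commit of this series) is expected to behave. Since the converter is `private[parquet]`, the sketch is written as if it lived in a test under the `org.apache.spark.sql.parquet` package; the field names and types are illustrative:

package org.apache.spark.sql.parquet

import org.apache.spark.sql.parquet.ParquetTypesConverter.mergeCatalystSchemas
import org.apache.spark.sql.types._

object MergeSchemasSketch {
  def main(args: Array[String]): Unit = {
    // Two part-files written at different times with slightly different schemas.
    val older = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("price", DecimalType(10, 2), nullable = true)))

    val newer = StructType(Seq(
      StructField("id", IntegerType, nullable = true),
      StructField("price", DecimalType(12, 4), nullable = true),
      StructField("comment", StringType, nullable = true)))

    // Fields present on either side are kept, nullability is OR-ed, and fixed-precision
    // decimals take the maximum precision and scale, so the merged schema should be
    // id: int (nullable), price: decimal(12, 4), comment: string.
    val merged = mergeCatalystSchemas(older, newer)
    println(merged.prettyJson)
  }
}
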
@@ -268,7 +266,7 @@ case class ParquetRelation2 if (isPartitioned) { def percentRead = selectedPartitions.size.toDouble / partitions.size.toDouble * 100 - logInfo(s"Reading $percentRead% of $path partitions") + logInfo(s"Reading $percentRead% of partitions") } val requiredColumns = output.map(_.name) @@ -287,7 +285,7 @@ case class ParquetRelation2 jobConf.set(SQLConf.PARQUET_CACHE_METADATA, useCache.toString) val baseRDD = - new org.apache.spark.rdd.NewHadoopRDD( + new NewHadoopRDD( sparkContext, classOf[FilteringParquetRowInputFormat], classOf[Void], @@ -347,29 +345,134 @@ case class ParquetRelation2 baseRDD.map(_._2) } } + + private def prunePartitions( + predicates: Seq[Expression], + partitions: Seq[Partition]): Seq[Partition] = { + val partitionColumnNames = partitionColumns.map(_.name).toSet + val partitionPruningPredicates = predicates.filter { + _.references.map(_.name).toSet.subsetOf(partitionColumnNames) + } + + val rawPredicate = partitionPruningPredicates.reduceOption(And).getOrElse(Literal(true)) + val boundPredicate = InterpretedPredicate(rawPredicate transform { + case a: AttributeReference => + val index = partitionColumns.indexWhere(a.name == _.name) + BoundReference(index, partitionColumns(index).dataType, nullable = true) + }) + + if (isPartitioned && partitionPruningPredicates.nonEmpty) { + partitions.filter(p => boundPredicate(p.values)) + } else { + partitions + } + } } object ParquetRelation2 { + // Whether we should merge schemas collected from all Parquet part-files. + val MERGE_SCHEMA = "parquet.mergeSchema" + + // Hive Metastore schema, passed in when the Parquet relation is converted from Metastore + val METASTORE_SCHEMA = "parquet.metastoreSchema" + + // Default partition name to use when the partition column value is null or empty string + val DEFAULT_PARTITION_NAME = "partition.defaultName" + + private[parquet] def readSchema(footers: Seq[Footer], sqlContext: SQLContext): StructType = { + footers.map { footer => + val metadata = footer.getParquetMetadata.getFileMetaData + val parquetSchema = metadata.getSchema + val maybeSparkSchema = metadata + .getKeyValueMetaData + .toMap + .get(RowReadSupport.SPARK_METADATA_KEY) + .map(DataType.fromJson(_).asInstanceOf[StructType]) + + maybeSparkSchema.getOrElse { + // Falls back to Parquet schema if Spark SQL schema is absent. + StructType.fromAttributes( + // TODO Really no need to use `Attribute` here, we only need to know the data type. + convertToAttributes(parquetSchema, sqlContext.conf.isParquetBinaryAsString)) + } + }.reduce { (left, right) => + try mergeCatalystSchemas(left, right) catch { case e: Throwable => + throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e) + } + } + } + + private[parquet] def mergeMetastoreParquetSchema( + metastoreSchema: StructType, + parquetSchema: StructType): StructType = { + def schemaConflictMessage = + s"""Converting Hive Metastore Parquet, but detected conflicting schemas. Metastore schema: + |${metastoreSchema.prettyJson} + | + |Parquet schema: + |${parquetSchema.prettyJson} + """.stripMargin + + assert(metastoreSchema.size == parquetSchema.size, schemaConflictMessage) + + val ordinalMap = metastoreSchema.zipWithIndex.map { + case (field, index) => field.name.toLowerCase -> index + }.toMap + val reorderedParquetSchema = parquetSchema.sortBy(f => ordinalMap(f.name.toLowerCase)) + + StructType(metastoreSchema.zip(reorderedParquetSchema).map { + // Uses Parquet field names but retains Metastore data types. 
+ case (mSchema, pSchema) if mSchema.name.toLowerCase == pSchema.name.toLowerCase => + mSchema.copy(name = pSchema.name) + case _ => + throw new SparkException(schemaConflictMessage) + }) + } + // TODO Data source implementations shouldn't touch Catalyst types (`Literal`). // However, we are already using Catalyst expressions for partition pruning and predicate // push-down here... - case class PartitionDesc(columnNames: Seq[String], literals: Seq[Literal]) { + private[parquet] case class PartitionValues(columnNames: Seq[String], literals: Seq[Literal]) { require(columnNames.size == literals.size) } /** - * Given a base path and all data file paths in it, returns a partition specification. + * Given a group of qualified paths, tries to parse them and returns a partition specification. + * For example, given: + * {{{ + * hdfs://:/path/to/partition/a=1/b=hello/c=3.14 + * hdfs://:/path/to/partition/a=2/b=world/c=6.28 + * }}} + * it returns: + * {{{ + * PartitionSpec( + * partitionColumns = StructType( + * StructField(name = "a", dataType = IntegerType, nullable = true), + * StructField(name = "b", dataType = StringType, nullable = true), + * StructField(name = "c", dataType = DoubleType, nullable = true)), + * partitions = Seq( + * Partition( + * values = Row(1, "hello", 3.14), + * path = "hdfs://:/path/to/partition/a=1/b=hello/c=3.14"), + * Partition( + * values = Row(2, "world", 6.28), + * path = "hdfs://:/path/to/partition/a=2/b=world/c=6.28"))) + * }}} */ - private[parquet] def parsePartitions(basePath: Path, dataPaths: Seq[Path]): PartitionSpec = { - val partitionDescs = resolvePartitions(dataPaths.map(parsePartition(basePath, _))) - val PartitionDesc(columnNames, columnLiterals) = partitionDescs.head - val fields = columnNames.zip(columnLiterals).map { case (name, Literal(_, dataType)) => - StructField(name, dataType, nullable = true) + private[parquet] def parsePartitions( + paths: Seq[Path], + defaultPartitionName: String): PartitionSpec = { + val partitionValues = resolvePartitions(paths.map(parsePartition(_, defaultPartitionName))) + val fields = { + val (PartitionValues(columnNames, literals)) = partitionValues.head + columnNames.zip(literals).map { case (name, Literal(_, dataType)) => + StructField(name, dataType, nullable = true) + } } - val partitions = (partitionDescs, dataPaths).zipped.map { (desc, path) => - val values = desc.literals.map(_.value) - Partition(Row(values: _*), path.toString) + val partitions = partitionValues.zip(paths).map { + case (PartitionValues(_, literals), path) => + Partition(Row(literals.map(_.value): _*), path.toString) } PartitionSpec(StructType(fields), partitions) @@ -379,12 +482,12 @@ object ParquetRelation2 { * Parses a single partition, returns column names and values of each partition column. 
For * example, given: * {{{ - * basePath = hdfs://host:9000/base/path/ - * dataPath = hdfs://host:9000/base/path/a=42/b=hello/c=3.14 + * basePath = hdfs://:/path/to/partition + * partitionPath = hdfs://:/path/to/partition/a=42/b=hello/c=3.14 * }}} - * we have: + * it returns: * {{{ - * PartitionSpec( + * PartitionDesc( * Seq("a", "b", "c"), * Seq( * Literal(42, IntegerType), @@ -392,27 +495,49 @@ object ParquetRelation2 { * Literal(3.14, FloatType))) * }}} */ - private[parquet] def parsePartition(basePath: Path, dataPath: Path): PartitionDesc = { - val rawSpec = dataPath.toString.stripPrefix(basePath.toString).stripPrefix(Path.SEPARATOR) - val (columnNames, values) = rawSpec.split(Path.SEPARATOR).map { column => - val equalSignIndex = column.indexOf('=') - assert(equalSignIndex > 0, s"Invalid partition column spec '$column' found in $dataPath") - val columnName = rawSpec.take(equalSignIndex) - val literal = inferPartitionColumnValue(rawSpec.drop(equalSignIndex + 1)) - columnName -> literal - }.unzip - - PartitionDesc(columnNames, values) + private[parquet] def parsePartition( + path: Path, + defaultPartitionName: String): PartitionValues = { + val columns = ArrayBuffer.empty[(String, Literal)] + var finished = path.isRoot + var chopped = path + + while (!finished) { + val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName) + maybeColumn.foreach(columns += _) + chopped = chopped.getParent + finished = maybeColumn.isEmpty || chopped.isRoot + } + + val (columnNames, values) = columns.unzip + PartitionValues(columnNames, values) + } + + private def parsePartitionColumn( + columnSpec: String, + defaultPartitionName: String): Option[(String, Literal)] = { + val equalSignIndex = columnSpec.indexOf('=') + if (equalSignIndex == -1) { + None + } else { + val columnName = columnSpec.take(equalSignIndex) + val literal = inferPartitionColumnValue( + columnSpec.drop(equalSignIndex + 1), defaultPartitionName) + Some(columnName -> literal) + } } /** * Resolves possible type conflicts between partitions by up-casting "lower" types. The up- * casting order is: * {{{ - * IntegerType -> LongType -> FloatType -> DoubleType -> DecimalType.Unlimited -> StringType + * NullType -> + * IntegerType -> LongType -> + * FloatType -> DoubleType -> DecimalType.Unlimited -> + * StringType * }}} */ - private[parquet] def resolvePartitions(descs: Seq[PartitionDesc]): Seq[PartitionDesc] = { + private[parquet] def resolvePartitions(descs: Seq[PartitionValues]): Seq[PartitionValues] = { val distinctColNamesOfPartitions = descs.map(_.columnNames).distinct val columnCount = descs.head.columnNames.size @@ -438,7 +563,9 @@ object ParquetRelation2 { * [[IntegerType]], [[LongType]], [[FloatType]], [[DoubleType]], [[DecimalType.Unlimited]], and * [[StringType]]. 
*/ - private[parquet] def inferPartitionColumnValue(raw: String): Literal = { + private[parquet] def inferPartitionColumnValue( + raw: String, + defaultPartitionName: String): Literal = { // First tries integral types Try(Literal(Integer.parseInt(raw), IntegerType)) .orElse(Try(Literal(JLong.parseLong(raw), LongType))) @@ -447,11 +574,13 @@ object ParquetRelation2 { .orElse(Try(Literal(JDouble.parseDouble(raw), DoubleType))) .orElse(Try(Literal(new JBigDecimal(raw), DecimalType.Unlimited))) // Then falls back to string - .getOrElse(Literal(raw, StringType)) + .getOrElse { + if (raw == defaultPartitionName) Literal(null, NullType) else Literal(raw, StringType) + } } private val upCastingOrder: Seq[DataType] = - Seq(IntegerType, LongType, FloatType, DoubleType, DecimalType.Unlimited, StringType) + Seq(NullType, IntegerType, LongType, FloatType, DoubleType, DecimalType.Unlimited, StringType) /** * Given a collection of [[Literal]]s, resolves possible type conflicts by up-casting "lower" diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index d2371d4a5519..e25daf399889 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -404,7 +404,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { TakeOrdered, ParquetOperations, InMemoryScans, - ParquetConversion, // Must be before HiveTableScans + // ParquetConversion, // Must be before HiveTableScans HiveTableScans, DataSinks, Scripts, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 243310686d08..e5c02c7d3416 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ +import org.apache.spark.sql.parquet.ParquetRelation2 import org.apache.spark.sql.sources.{DDLParser, LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -175,10 +176,24 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with Nil } - // Since HiveQL is case insensitive for table names we make them all lowercase. 
- MetastoreRelation( + val relation = MetastoreRelation( databaseName, tblName, alias)( table.getTTable, partitions.map(part => part.getTPartition))(hive) + + if (hive.convertMetastoreParquet && + relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet")) { + val metastoreSchema = StructType.fromAttributes(relation.output) + val paths = if (relation.hiveQlTable.isPartitioned) { + relation.hiveQlPartitions.map(p => p.getLocation) + } else { + Seq(relation.hiveQlTable.getDataLocation.toString) + } + + LogicalRelation(ParquetRelation2( + paths, Map(ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json))(hive)) + } else { + relation + } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 7857a0252ebb..b2cd1ad6097a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -137,7 +137,7 @@ private[hive] trait HiveStrategies { } hiveContext - .parquetFile(partitions.map(_.getLocation).mkString(",")) + .parquetFile(partitions.map(_.getLocation): _*) .addPartitioningAttributes(relation.partitionKeys) .lowerCase .where(unresolvedOtherPredicates) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 30441bbbdf81..40370359c722 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.Row import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.execution.PhysicalRDD import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ @@ -79,7 +80,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' - location '${new File(partitionedTableDir, "p=1").getCanonicalPath}' + location '${new File(normalTableDir, "normal").getCanonicalPath}' """) (1 to 10).foreach { p => @@ -105,6 +106,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { assert( sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect { case _: ParquetTableScan => true + case _: PhysicalRDD => true }.nonEmpty) } } @@ -147,6 +149,7 @@ class ParquetSourceSuite extends ParquetPartitioningTest { */ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll { var partitionedTableDir: File = null + var normalTableDir: File = null var partitionedTableDirWithKey: File = null import org.apache.spark.sql.hive.test.TestHive.implicits._ @@ -156,6 +159,10 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll partitionedTableDir.delete() partitionedTableDir.mkdir() + normalTableDir = File.createTempFile("parquettests", "sparksql") + normalTableDir.delete() + normalTableDir.mkdir() + (1 to 10).foreach { p => val partDir = new File(partitionedTableDir, s"p=$p") sparkContext.makeRDD(1 to 10) @@ -163,6 +170,11 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll .saveAsParquetFile(partDir.getCanonicalPath) } + sparkContext + .makeRDD(1 to 10) + .map(i => ParquetData(i, s"part-1")) + 
.saveAsParquetFile(new File(normalTableDir, "normal").getCanonicalPath) + partitionedTableDirWithKey = File.createTempFile("parquettests", "sparksql") partitionedTableDirWithKey.delete() partitionedTableDirWithKey.mkdir() From 596c3120c03427939c8dcbc9971903a26780d0f3 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 2 Feb 2015 03:30:01 -0800 Subject: [PATCH 04/14] Uses switch to control whether use Parquet data source or not --- .../scala/org/apache/spark/sql/SQLConf.scala | 2 +- .../org/apache/spark/sql/SQLContext.scala | 17 +- .../spark/sql/execution/SparkStrategies.scala | 9 +- .../spark/sql/parquet/ParquetTest.scala | 2 +- .../sql/parquet/ParquetFilterSuite.scala | 354 +++++++++--------- .../spark/sql/parquet/ParquetIOSuite.scala | 348 ++++++++--------- .../spark/sql/parquet/ParquetQuerySuite.scala | 127 ++++--- .../apache/spark/sql/hive/HiveContext.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 1 + .../spark/sql/hive/HiveStrategies.scala | 3 +- .../spark/sql/parquet/HiveParquetSuite.scala | 78 ++-- .../spark/sql/parquet/parquetSuites.scala | 188 +++++----- 12 files changed, 589 insertions(+), 542 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 0c7624a5d7d8..5ef3bb022fc5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -107,7 +107,7 @@ private[sql] class SQLConf extends Serializable { getConf(PARQUET_FILTER_PUSHDOWN_ENABLED, "false").toBoolean /** When true uses Parquet implementation based on data source API */ - private[spark] def parquetUseDataSourceApi= + private[spark] def parquetUseDataSourceApi = getConf(PARQUET_USE_DATA_SOURCE_API, "true").toBoolean /** When true the planner will use the external sort, which may spill to disk. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index ae94bab3aec1..df43d51c6ae9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -20,14 +20,13 @@ package org.apache.spark.sql import java.beans.Introspector import java.util.Properties -import scala.collection.immutable import scala.collection.JavaConversions._ +import scala.collection.immutable import scala.language.implicitConversions import scala.reflect.runtime.universe.TypeTag -import org.apache.spark.{SparkContext, Partition} import org.apache.spark.annotation.{AlphaComponent, DeveloperApi, Experimental} -import org.apache.spark.api.java.{JavaSparkContext, JavaRDD} +import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis._ @@ -36,11 +35,12 @@ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution._ -import org.apache.spark.sql.json._ import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation} -import org.apache.spark.sql.sources._ +import org.apache.spark.sql.json._ +import org.apache.spark.sql.sources.{BaseRelation, DDLParser, DataSourceStrategy, LogicalRelation, _} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils +import org.apache.spark.{Partition, SparkContext} /** * :: AlphaComponent :: @@ -305,7 +305,12 @@ class SQLContext(@transient val sparkContext: SparkContext) */ @scala.annotation.varargs def parquetFile(paths: String*): DataFrame = - baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this)) + if (conf.parquetUseDataSourceApi) { + baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this)) + } else { + new DataFrame(this, parquet.ParquetRelation( + paths.mkString(","), Some(sparkContext.hadoopConfiguration), this)) + } /** * Loads a JSON file (one object per line), returning the result as a [[DataFrame]]. 
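Between these two file diffs, a small usage sketch of the switch introduced above; it is illustrative only. The application name and paths are made up, and the configuration key string is an assumption — the patch itself only references it through `SQLConf.PARQUET_USE_DATA_SOURCE_API`:

{{{
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object ParquetApiSwitchSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("parquet-api-switch").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)

    // With the flag at its default ("true"), parquetFile goes through the new
    // data-source-based ParquetRelation2, which performs partition discovery.
    val viaDataSource = sqlContext.parquetFile("/tmp/warehouse/events")
    viaDataSource.printSchema()

    // Setting the flag to "false" falls back to the original ParquetRelation code path.
    // The key name below is assumed for illustration; in the patch it is the value of
    // SQLConf.PARQUET_USE_DATA_SOURCE_API.
    sqlContext.setConf("spark.sql.parquet.useDataSourceApi", "false")
    val viaOldImplementation = sqlContext.parquetFile("/tmp/warehouse/events")
    viaOldImplementation.printSchema()

    sc.stop()
  }
}
}}}

The test suites later in this patch exercise both code paths in the same way, flipping the flag with `withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> ...)` around a shared `run(prefix)` body.
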
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index f06f5fd1fc51..81bcf5a6f32d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -17,18 +17,17 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{SQLContext, Strategy, execution} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning._ import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation} +import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand} import org.apache.spark.sql.parquet._ +import org.apache.spark.sql.sources.{CreateTableUsing, CreateTempTableUsing, DescribeCommand => LogicalDescribeCommand, _} import org.apache.spark.sql.types._ -import org.apache.spark.sql.sources.{DescribeCommand => LogicalDescribeCommand} -import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand} -import org.apache.spark.sql.sources._ +import org.apache.spark.sql.{SQLContext, Strategy, execution} private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { self: SQLContext#SparkPlanner => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index 8d3e094e3344..ee0558c22183 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -90,7 +90,7 @@ trait ParquetTest { (f: String => Unit): Unit = { import sqlContext.implicits._ withTempPath { file => - sparkContext.parallelize(data).saveAsParquetFile(file.getCanonicalPath) + sparkContext.parallelize(data, 3).saveAsParquetFile(file.getCanonicalPath) f(file.getCanonicalPath) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index d5363164ebcb..e0fd8aa5f917 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -94,217 +94,227 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { checkFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) } - test("filter pushdown - boolean") { - withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) - - checkFilterPredicate('_1 === true, classOf[Eq[_]], true) - checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) + private def checkBinaryFilterPredicate + (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) + (implicit rdd: DataFrame): Unit = { + def checkBinaryAnswer(rdd: DataFrame, expected: Seq[Row]) = { + assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).toSeq.sorted) { + 
rdd.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted + } } - } - test("filter pushdown - short") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => - checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) - checkFilterPredicate( - Cast('_1, IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - - checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) - checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) - - checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) - - checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) - checkFilterPredicate( - Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) - checkFilterPredicate( - Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, - classOf[Operators.Or], - Seq(Row(1), Row(4))) - } + checkFilterPredicate(rdd, predicate, filterClass, checkBinaryAnswer _, expected) } - test("filter pushdown - integer") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + private def checkBinaryFilterPredicate + (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Array[Byte]) + (implicit rdd: DataFrame): Unit = { + checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) + } - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + def run(prefix: String): Unit = { + test(s"$prefix: filter pushdown - boolean") { + withParquetRDD((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], Seq(Row(true), Row(false))) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 === true, classOf[Eq[_]], true) + checkFilterPredicate('_1 !== true, classOf[NotEq[_]], false) + } + } - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + test(s"$prefix: filter pushdown - short") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toShort)))) { implicit rdd => + checkFilterPredicate(Cast('_1, IntegerType) === 1, classOf[Eq[_]], 1) + checkFilterPredicate( + Cast('_1, 
IntegerType) !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + + checkFilterPredicate(Cast('_1, IntegerType) < 2, classOf[Lt[_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) > 3, classOf[Gt[_]], 4) + checkFilterPredicate(Cast('_1, IntegerType) <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate(Cast('_1, IntegerType) >= 4, classOf[GtEq[_]], 4) + + checkFilterPredicate(Literal(1) === Cast('_1, IntegerType), classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > Cast('_1, IntegerType), classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < Cast('_1, IntegerType), classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= Cast('_1, IntegerType), classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= Cast('_1, IntegerType), classOf[GtEq[_]], 4) + + checkFilterPredicate(!(Cast('_1, IntegerType) < 4), classOf[GtEq[_]], 4) + checkFilterPredicate( + Cast('_1, IntegerType) > 2 && Cast('_1, IntegerType) < 4, classOf[Operators.And], 3) + checkFilterPredicate( + Cast('_1, IntegerType) < 2 || Cast('_1, IntegerType) > 3, + classOf[Operators.Or], + Seq(Row(1), Row(4))) + } } - } - test("filter pushdown - long") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test(s"$prefix: filter pushdown - integer") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + } } - } - test("filter pushdown - float") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) 
- checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test(s"$prefix: filter pushdown - long") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + } } - } - test("filter pushdown - double") { - withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + test(s"$prefix: filter pushdown - float") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) - checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) - checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) - checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) - checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) - checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) - checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) - checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) - checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) - 
checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) - checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) - checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) - checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) - checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + } } - } - test("filter pushdown - string") { - withParquetRDD((1 to 4).map(i => Tuple1(i.toString))) { implicit rdd => - checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkFilterPredicate( - '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) - - checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") - checkFilterPredicate('_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) - - checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") - checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") - checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") - checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") - - checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") - checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") - checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") - checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") - checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") - - checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") - checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") - checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) + test(s"$prefix: filter pushdown - double") { + withParquetRDD((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate('_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(Row.apply(_))) + + checkFilterPredicate('_1 === 1, classOf[Eq[_]], 1) + checkFilterPredicate('_1 !== 1, classOf[NotEq[_]], (2 to 4).map(Row.apply(_))) + + checkFilterPredicate('_1 < 2, classOf[Lt[_]], 1) + checkFilterPredicate('_1 > 3, classOf[Gt[_]], 4) + checkFilterPredicate('_1 <= 1, classOf[LtEq[_]], 1) + checkFilterPredicate('_1 >= 4, classOf[GtEq[_]], 4) + + checkFilterPredicate(Literal(1) === '_1, classOf[Eq[_]], 1) + checkFilterPredicate(Literal(2) > '_1, classOf[Lt[_]], 1) + checkFilterPredicate(Literal(3) < '_1, classOf[Gt[_]], 4) + checkFilterPredicate(Literal(1) >= '_1, classOf[LtEq[_]], 1) + checkFilterPredicate(Literal(4) <= '_1, classOf[GtEq[_]], 4) + + checkFilterPredicate(!('_1 < 4), classOf[GtEq[_]], 4) + checkFilterPredicate('_1 > 2 && '_1 < 4, classOf[Operators.And], 3) + checkFilterPredicate('_1 < 2 || '_1 > 3, classOf[Operators.Or], Seq(Row(1), Row(4))) + } } - } - def checkBinaryFilterPredicate - (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Seq[Row]) - (implicit rdd: DataFrame): Unit = { - def checkBinaryAnswer(rdd: DataFrame, expected: Seq[Row]) = { - 
assertResult(expected.map(_.getAs[Array[Byte]](0).mkString(",")).toSeq.sorted) { - rdd.map(_.getAs[Array[Byte]](0).mkString(",")).collect().toSeq.sorted + test(s"$prefix: filter pushdown - string") { + withParquetRDD((1 to 4).map(i => Tuple1(i.toString))) { implicit rdd => + checkFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkFilterPredicate( + '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) + + checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") + checkFilterPredicate('_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) + + checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") + checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") + checkFilterPredicate('_1 <= "1", classOf[LtEq[_]], "1") + checkFilterPredicate('_1 >= "4", classOf[GtEq[_]], "4") + + checkFilterPredicate(Literal("1") === '_1, classOf[Eq[_]], "1") + checkFilterPredicate(Literal("2") > '_1, classOf[Lt[_]], "1") + checkFilterPredicate(Literal("3") < '_1, classOf[Gt[_]], "4") + checkFilterPredicate(Literal("1") >= '_1, classOf[LtEq[_]], "1") + checkFilterPredicate(Literal("4") <= '_1, classOf[GtEq[_]], "4") + + checkFilterPredicate(!('_1 < "4"), classOf[GtEq[_]], "4") + checkFilterPredicate('_1 > "2" && '_1 < "4", classOf[Operators.And], "3") + checkFilterPredicate('_1 < "2" || '_1 > "3", classOf[Operators.Or], Seq(Row("1"), Row("4"))) } } - checkFilterPredicate(rdd, predicate, filterClass, checkBinaryAnswer _, expected) - } + test(s"$prefix: filter pushdown - binary") { + implicit class IntToBinary(int: Int) { + def b: Array[Byte] = int.toString.getBytes("UTF-8") + } - def checkBinaryFilterPredicate - (predicate: Predicate, filterClass: Class[_ <: FilterPredicate], expected: Array[Byte]) - (implicit rdd: DataFrame): Unit = { - checkBinaryFilterPredicate(predicate, filterClass, Seq(Row(expected)))(rdd) - } + withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { implicit rdd => + checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) - test("filter pushdown - binary") { - implicit class IntToBinary(int: Int) { - def b: Array[Byte] = int.toString.getBytes("UTF-8") - } + checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) + checkBinaryFilterPredicate( + '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.b)).toSeq) - withParquetRDD((1 to 4).map(i => Tuple1(i.b))) { implicit rdd => - checkBinaryFilterPredicate('_1 === 1.b, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate( + '_1 !== 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) - checkBinaryFilterPredicate('_1.isNull, classOf[Eq[_]], Seq.empty[Row]) - checkBinaryFilterPredicate( - '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.b)).toSeq) + checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt[_]], 1.b) + checkBinaryFilterPredicate('_1 > 3.b, classOf[Gt[_]], 4.b) + checkBinaryFilterPredicate('_1 <= 1.b, classOf[LtEq[_]], 1.b) + checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate( - '_1 !== 1.b, classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.b)).toSeq) + checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) + checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) + checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) + checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) + checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate('_1 < 2.b, classOf[Lt[_]], 1.b) - checkBinaryFilterPredicate('_1 > 3.b, 
classOf[Gt[_]], 4.b) - checkBinaryFilterPredicate('_1 <= 1.b, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate('_1 >= 4.b, classOf[GtEq[_]], 4.b) + checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) + checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) + checkBinaryFilterPredicate( + '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) + } + } + } - checkBinaryFilterPredicate(Literal(1.b) === '_1, classOf[Eq[_]], 1.b) - checkBinaryFilterPredicate(Literal(2.b) > '_1, classOf[Lt[_]], 1.b) - checkBinaryFilterPredicate(Literal(3.b) < '_1, classOf[Gt[_]], 4.b) - checkBinaryFilterPredicate(Literal(1.b) >= '_1, classOf[LtEq[_]], 1.b) - checkBinaryFilterPredicate(Literal(4.b) <= '_1, classOf[GtEq[_]], 4.b) + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { + run("Enable Parquet data source") + } - checkBinaryFilterPredicate(!('_1 < 4.b), classOf[GtEq[_]], 4.b) - checkBinaryFilterPredicate('_1 > 2.b && '_1 < 4.b, classOf[Operators.And], 3.b) - checkBinaryFilterPredicate( - '_1 < 2.b || '_1 > 3.b, classOf[Operators.Or], Seq(Row(1.b), Row(4.b))) - } + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { + run("Disable Parquet data source") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 0bc246c64560..757eb61bac7c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -73,218 +73,228 @@ class ParquetIOSuite extends QueryTest with ParquetTest { withParquetRDD(data)(r => checkAnswer(r, data.map(Row.fromTuple))) } - test("basic data types (without binary)") { - val data = (1 to 4).map { i => - (i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) + def run(prefix: String): Unit = { + test(s"$prefix: basic data types (without binary)") { + val data = (1 to 4).map { i => + (i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) + } + checkParquetFile(data) } - checkParquetFile(data) - } - test("raw binary") { - val data = (1 to 4).map(i => Tuple1(Array.fill(3)(i.toByte))) - withParquetRDD(data) { rdd => - assertResult(data.map(_._1.mkString(",")).sorted) { - rdd.collect().map(_.getAs[Array[Byte]](0).mkString(",")).sorted + test(s"$prefix: raw binary") { + val data = (1 to 4).map(i => Tuple1(Array.fill(3)(i.toByte))) + withParquetRDD(data) { rdd => + assertResult(data.map(_._1.mkString(",")).sorted) { + rdd.collect().map(_.getAs[Array[Byte]](0).mkString(",")).sorted + } } } - } - test("string") { - val data = (1 to 4).map(i => Tuple1(i.toString)) - // Property spark.sql.parquet.binaryAsString shouldn't affect Parquet files written by Spark SQL - // as we store Spark SQL schema in the extra metadata. - withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "false")(checkParquetFile(data)) - withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "true")(checkParquetFile(data)) - } + test(s"$prefix: string") { + val data = (1 to 4).map(i => Tuple1(i.toString)) + // Property spark.sql.parquet.binaryAsString shouldn't affect Parquet files written by Spark SQL + // as we store Spark SQL schema in the extra metadata. 
+ withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "false")(checkParquetFile(data)) + withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING -> "true")(checkParquetFile(data)) + } - test("fixed-length decimals") { - import org.apache.spark.sql.test.TestSQLContext.implicits._ + test(s"$prefix: fixed-length decimals") { + import org.apache.spark.sql.test.TestSQLContext.implicits._ - def makeDecimalRDD(decimal: DecimalType): DataFrame = - sparkContext - .parallelize(0 to 1000) - .map(i => Tuple1(i / 100.0)) - .select($"_1" cast decimal as "abcd") + def makeDecimalRDD(decimal: DecimalType): DataFrame = + sparkContext + .parallelize(0 to 1000) + .map(i => Tuple1(i / 100.0)) + .select($"_1" cast decimal as "abcd") - for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) { - withTempPath { dir => - val data = makeDecimalRDD(DecimalType(precision, scale)) - data.saveAsParquetFile(dir.getCanonicalPath) - checkAnswer(parquetFile(dir.getCanonicalPath), data.collect().toSeq) + for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) { + withTempPath { dir => + val data = makeDecimalRDD(DecimalType(precision, scale)) + data.saveAsParquetFile(dir.getCanonicalPath) + checkAnswer(parquetFile(dir.getCanonicalPath), data.collect().toSeq) + } } - } - // Decimals with precision above 18 are not yet supported - intercept[RuntimeException] { - withTempPath { dir => - makeDecimalRDD(DecimalType(19, 10)).saveAsParquetFile(dir.getCanonicalPath) - parquetFile(dir.getCanonicalPath).collect() + // Decimals with precision above 18 are not yet supported + intercept[RuntimeException] { + withTempPath { dir => + makeDecimalRDD(DecimalType(19, 10)).saveAsParquetFile(dir.getCanonicalPath) + parquetFile(dir.getCanonicalPath).collect() + } } - } - // Unlimited-length decimals are not yet supported - intercept[RuntimeException] { - withTempPath { dir => - makeDecimalRDD(DecimalType.Unlimited).saveAsParquetFile(dir.getCanonicalPath) - parquetFile(dir.getCanonicalPath).collect() + // Unlimited-length decimals are not yet supported + intercept[RuntimeException] { + withTempPath { dir => + makeDecimalRDD(DecimalType.Unlimited).saveAsParquetFile(dir.getCanonicalPath) + parquetFile(dir.getCanonicalPath).collect() + } } } - } - test("map") { - val data = (1 to 4).map(i => Tuple1(Map(i -> s"val_$i"))) - checkParquetFile(data) - } + test(s"$prefix: map") { + val data = (1 to 4).map(i => Tuple1(Map(i -> s"val_$i"))) + checkParquetFile(data) + } - test("array") { - val data = (1 to 4).map(i => Tuple1(Seq(i, i + 1))) - checkParquetFile(data) - } + test(s"$prefix: array") { + val data = (1 to 4).map(i => Tuple1(Seq(i, i + 1))) + checkParquetFile(data) + } - test("struct") { - val data = (1 to 4).map(i => Tuple1((i, s"val_$i"))) - withParquetRDD(data) { rdd => - // Structs are converted to `Row`s - checkAnswer(rdd, data.map { case Tuple1(struct) => - Row(Row(struct.productIterator.toSeq: _*)) - }) + test(s"$prefix: struct") { + val data = (1 to 4).map(i => Tuple1((i, s"val_$i"))) + withParquetRDD(data) { rdd => + // Structs are converted to `Row`s + checkAnswer(rdd, data.map { case Tuple1(struct) => + Row(Row(struct.productIterator.toSeq: _*)) + }) + } } - } - test("nested struct with array of array as field") { - val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) - withParquetRDD(data) { rdd => - // Structs are converted to `Row`s - checkAnswer(rdd, data.map { case Tuple1(struct) => - Row(Row(struct.productIterator.toSeq: _*)) - }) + test(s"$prefix: nested struct with array of array as field") { + 
val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) + withParquetRDD(data) { rdd => + // Structs are converted to `Row`s + checkAnswer(rdd, data.map { case Tuple1(struct) => + Row(Row(struct.productIterator.toSeq: _*)) + }) + } } - } - test("nested map with struct as value type") { - val data = (1 to 4).map(i => Tuple1(Map(i -> (i, s"val_$i")))) - withParquetRDD(data) { rdd => - checkAnswer(rdd, data.map { case Tuple1(m) => - Row(m.mapValues(struct => Row(struct.productIterator.toSeq: _*))) - }) + test(s"$prefix: nested map with struct as value type") { + val data = (1 to 4).map(i => Tuple1(Map(i -> (i, s"val_$i")))) + withParquetRDD(data) { rdd => + checkAnswer(rdd, data.map { case Tuple1(m) => + Row(m.mapValues(struct => Row(struct.productIterator.toSeq: _*))) + }) + } } - } - test("nulls") { - val allNulls = ( - null.asInstanceOf[java.lang.Boolean], - null.asInstanceOf[Integer], - null.asInstanceOf[java.lang.Long], - null.asInstanceOf[java.lang.Float], - null.asInstanceOf[java.lang.Double]) - - withParquetRDD(allNulls :: Nil) { rdd => - val rows = rdd.collect() - assert(rows.size === 1) - assert(rows.head === Row(Seq.fill(5)(null): _*)) + test(s"$prefix: nulls") { + val allNulls = ( + null.asInstanceOf[java.lang.Boolean], + null.asInstanceOf[Integer], + null.asInstanceOf[java.lang.Long], + null.asInstanceOf[java.lang.Float], + null.asInstanceOf[java.lang.Double]) + + withParquetRDD(allNulls :: Nil) { rdd => + val rows = rdd.collect() + assert(rows.size === 1) + assert(rows.head === Row(Seq.fill(5)(null): _*)) + } } - } - test("nones") { - val allNones = ( - None.asInstanceOf[Option[Int]], - None.asInstanceOf[Option[Long]], - None.asInstanceOf[Option[String]]) + test(s"$prefix: nones") { + val allNones = ( + None.asInstanceOf[Option[Int]], + None.asInstanceOf[Option[Long]], + None.asInstanceOf[Option[String]]) - withParquetRDD(allNones :: Nil) { rdd => - val rows = rdd.collect() - assert(rows.size === 1) - assert(rows.head === Row(Seq.fill(3)(null): _*)) + withParquetRDD(allNones :: Nil) { rdd => + val rows = rdd.collect() + assert(rows.size === 1) + assert(rows.head === Row(Seq.fill(3)(null): _*)) + } } - } - test("compression codec") { - def compressionCodecFor(path: String) = { - val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(configuration)) - .getBlocks - .flatMap(_.getColumns) - .map(_.getCodec.name()) - .distinct - - assert(codecs.size === 1) - codecs.head - } + test(s"$prefix: compression codec") { + def compressionCodecFor(path: String) = { + val codecs = ParquetTypesConverter + .readMetaData(new Path(path), Some(configuration)) + .getBlocks + .flatMap(_.getColumns) + .map(_.getCodec.name()) + .distinct + + assert(codecs.size === 1) + codecs.head + } - val data = (0 until 10).map(i => (i, i.toString)) + val data = (0 until 10).map(i => (i, i.toString)) - def checkCompressionCodec(codec: CompressionCodecName): Unit = { - withSQLConf(SQLConf.PARQUET_COMPRESSION -> codec.name()) { - withParquetFile(data) { path => - assertResult(conf.parquetCompressionCodec.toUpperCase) { - compressionCodecFor(path) + def checkCompressionCodec(codec: CompressionCodecName): Unit = { + withSQLConf(SQLConf.PARQUET_COMPRESSION -> codec.name()) { + withParquetFile(data) { path => + assertResult(conf.parquetCompressionCodec.toUpperCase) { + compressionCodecFor(path) + } } } } - } - // Checks default compression codec - checkCompressionCodec(CompressionCodecName.fromConf(conf.parquetCompressionCodec)) + // Checks default compression codec + 
checkCompressionCodec(CompressionCodecName.fromConf(conf.parquetCompressionCodec)) - checkCompressionCodec(CompressionCodecName.UNCOMPRESSED) - checkCompressionCodec(CompressionCodecName.GZIP) - checkCompressionCodec(CompressionCodecName.SNAPPY) - } + checkCompressionCodec(CompressionCodecName.UNCOMPRESSED) + checkCompressionCodec(CompressionCodecName.GZIP) + checkCompressionCodec(CompressionCodecName.SNAPPY) + } - test("read raw Parquet file") { - def makeRawParquetFile(path: Path): Unit = { - val schema = MessageTypeParser.parseMessageType( - """ - |message root { - | required boolean _1; - | required int32 _2; - | required int64 _3; - | required float _4; - | required double _5; - |} - """.stripMargin) - - val writeSupport = new TestGroupWriteSupport(schema) - val writer = new ParquetWriter[Group](path, writeSupport) - - (0 until 10).foreach { i => - val record = new SimpleGroup(schema) - record.add(0, i % 2 == 0) - record.add(1, i) - record.add(2, i.toLong) - record.add(3, i.toFloat) - record.add(4, i.toDouble) - writer.write(record) - } + test(s"$prefix: read raw Parquet file") { + def makeRawParquetFile(path: Path): Unit = { + val schema = MessageTypeParser.parseMessageType( + """ + |message root { + | required boolean _1; + | required int32 _2; + | required int64 _3; + | required float _4; + | required double _5; + |} + """.stripMargin) + + val writeSupport = new TestGroupWriteSupport(schema) + val writer = new ParquetWriter[Group](path, writeSupport) + + (0 until 10).foreach { i => + val record = new SimpleGroup(schema) + record.add(0, i % 2 == 0) + record.add(1, i) + record.add(2, i.toLong) + record.add(3, i.toFloat) + record.add(4, i.toDouble) + writer.write(record) + } - writer.close() - } + writer.close() + } - withTempDir { dir => - val path = new Path(dir.toURI.toString, "part-r-0.parquet") - makeRawParquetFile(path) - checkAnswer(parquetFile(path.toString), (0 until 10).map { i => - Row(i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) - }) + withTempDir { dir => + val path = new Path(dir.toURI.toString, "part-r-0.parquet") + makeRawParquetFile(path) + checkAnswer(parquetFile(path.toString), (0 until 10).map { i => + Row(i % 2 == 0, i, i.toLong, i.toFloat, i.toDouble) + }) + } } - } - test("write metadata") { - withTempPath { file => - val path = new Path(file.toURI.toString) - val fs = FileSystem.getLocal(configuration) - val attributes = ScalaReflection.attributesFor[(Int, String)] - ParquetTypesConverter.writeMetaData(attributes, path, configuration) + test(s"$prefix: write metadata") { + withTempPath { file => + val path = new Path(file.toURI.toString) + val fs = FileSystem.getLocal(configuration) + val attributes = ScalaReflection.attributesFor[(Int, String)] + ParquetTypesConverter.writeMetaData(attributes, path, configuration) - assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) - assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) + assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) + assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) - val actualSchema = metaData.getFileMetaData.getSchema - val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) + val metaData = ParquetTypesConverter.readMetaData(path, Some(configuration)) + val actualSchema = metaData.getFileMetaData.getSchema + val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) - 
actualSchema.checkContains(expectedSchema) - expectedSchema.checkContains(actualSchema) + actualSchema.checkContains(expectedSchema) + expectedSchema.checkContains(actualSchema) + } } } + + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { + run("Enable Parquet data source") + } + + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { + run("Disable Parquet data source") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 4c9c86fee9d4..6f4f8ff9c25d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -17,10 +17,10 @@ package org.apache.spark.sql.parquet -import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ +import org.apache.spark.sql.{QueryTest, SQLConf} /** * A test suite that tests various Parquet queries. @@ -28,82 +28,93 @@ import org.apache.spark.sql.test.TestSQLContext._ class ParquetQuerySuite extends QueryTest with ParquetTest { val sqlContext = TestSQLContext - test("simple projection") { - withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { - checkAnswer(sql("SELECT _1 FROM t"), (0 until 10).map(Row.apply(_))) + def run(prefix: String): Unit = { + test(s"$prefix: simple projection") { + withParquetTable((0 until 10).map(i => (i, i.toString)), "t") { + checkAnswer(sql("SELECT _1 FROM t"), (0 until 10).map(Row.apply(_))) + } } - } - ignore("appending") { - val data = (0 until 10).map(i => (i, i.toString)) - withParquetTable(data, "t") { - sql("INSERT INTO TABLE t SELECT * FROM t") - checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) + // TODO Re-enable this after data source insertion API is merged + ignore(s"$prefix: appending") { + val data = (0 until 10).map(i => (i, i.toString)) + withParquetTable(data, "t") { + sql("INSERT INTO t SELECT * FROM t") + checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) + } } - } - // This test case will trigger the NPE mentioned in - // https://issues.apache.org/jira/browse/PARQUET-151. - ignore("overwriting") { - val data = (0 until 10).map(i => (i, i.toString)) - withParquetTable(data, "t") { - sql("INSERT OVERWRITE TABLE t SELECT * FROM t") - checkAnswer(table("t"), data.map(Row.fromTuple)) + // This test case will trigger the NPE mentioned in + // https://issues.apache.org/jira/browse/PARQUET-151. 
+ ignore("overwriting") { + val data = (0 until 10).map(i => (i, i.toString)) + withParquetTable(data, "t") { + sql("INSERT OVERWRITE TABLE t SELECT * FROM t") + checkAnswer(table("t"), data.map(Row.fromTuple)) + } } - } - test("self-join") { - // 4 rows, cells of column 1 of row 2 and row 4 are null - val data = (1 to 4).map { i => - val maybeInt = if (i % 2 == 0) None else Some(i) - (maybeInt, i.toString) - } + test("self-join") { + // 4 rows, cells of column 1 of row 2 and row 4 are null + val data = (1 to 4).map { i => + val maybeInt = if (i % 2 == 0) None else Some(i) + (maybeInt, i.toString) + } - withParquetTable(data, "t") { - val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") - val queryOutput = selfJoin.queryExecution.analyzed.output + withParquetTable(data, "t") { + val selfJoin = sql("SELECT * FROM t x JOIN t y WHERE x._1 = y._1") + val queryOutput = selfJoin.queryExecution.analyzed.output - assertResult(4, s"Field count mismatches")(queryOutput.size) - assertResult(2, s"Duplicated expression ID in query plan:\n $selfJoin") { - queryOutput.filter(_.name == "_1").map(_.exprId).size - } + assertResult(4, s"Field count mismatches")(queryOutput.size) + assertResult(2, s"Duplicated expression ID in query plan:\n $selfJoin") { + queryOutput.filter(_.name == "_1").map(_.exprId).size + } - checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3"))) + checkAnswer(selfJoin, List(Row(1, "1", 1, "1"), Row(3, "3", 3, "3"))) + } } - } - test("nested data - struct with array field") { - val data = (1 to 10).map(i => Tuple1((i, Seq(s"val_$i")))) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT _1._2[0] FROM t"), data.map { - case Tuple1((_, Seq(string))) => Row(string) - }) + test(s"$prefix: nested data - struct with array field") { + val data = (1 to 10).map(i => Tuple1((i, Seq(s"val_$i")))) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT _1._2[0] FROM t"), data.map { + case Tuple1((_, Seq(string))) => Row(string) + }) + } } - } - test("nested data - array of struct") { - val data = (1 to 10).map(i => Tuple1(Seq(i -> s"val_$i"))) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT _1[0]._2 FROM t"), data.map { - case Tuple1(Seq((_, string))) => Row(string) - }) + test(s"$prefix: nested data - array of struct") { + val data = (1 to 10).map(i => Tuple1(Seq(i -> s"val_$i"))) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT _1[0]._2 FROM t"), data.map { + case Tuple1(Seq((_, string))) => Row(string) + }) + } } - } - test("SPARK-1913 regression: columns only referenced by pushed down filters should remain") { - withParquetTable((1 to 10).map(Tuple1.apply), "t") { - checkAnswer(sql(s"SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) + test(s"$prefix: SPARK-1913 regression: columns only referenced by pushed down filters should remain") { + withParquetTable((1 to 10).map(Tuple1.apply), "t") { + checkAnswer(sql(s"SELECT _1 FROM t WHERE _1 < 10"), (1 to 9).map(Row.apply(_))) + } } - } - test("SPARK-5309 strings stored using dictionary compression in parquet") { - withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { + test(s"$prefix: SPARK-5309 strings stored using dictionary compression in parquet") { + withParquetTable((0 until 1000).map(i => ("same", "run_" + i /100, 1)), "t") { - checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), - (0 until 10).map(i => Row("same", "run_" + i, 100))) + checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t GROUP BY _1, _2"), + (0 until 10).map(i => Row("same", 
"run_" + i, 100))) - checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), - List(Row("same", "run_5", 100))) + checkAnswer(sql(s"SELECT _1, _2, SUM(_3) FROM t WHERE _2 = 'run_5' GROUP BY _1, _2"), + List(Row("same", "run_5", 100))) + } } } + + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { + run("Enable Parquet data source") + } + + withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { + run("Disable Parquet data source") + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index e25daf399889..d2371d4a5519 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -404,7 +404,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { TakeOrdered, ParquetOperations, InMemoryScans, - // ParquetConversion, // Must be before HiveTableScans + ParquetConversion, // Must be before HiveTableScans HiveTableScans, DataSinks, Scripts, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index e5c02c7d3416..c78369d12cf5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -181,6 +181,7 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with table.getTTable, partitions.map(part => part.getTPartition))(hive) if (hive.convertMetastoreParquet && + hive.conf.parquetUseDataSourceApi && relation.tableDesc.getSerdeClassName.toLowerCase.contains("parquet")) { val metastoreSchema = StructType.fromAttributes(relation.output) val paths = if (relation.hiveQlTable.isPartitioned) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index b2cd1ad6097a..12e9bc8fa05e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -87,7 +87,8 @@ private[hive] trait HiveStrategies { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case PhysicalOperation(projectList, predicates, relation: MetastoreRelation) if relation.tableDesc.getSerdeClassName.contains("Parquet") && - hiveContext.convertMetastoreParquet => + hiveContext.convertMetastoreParquet && + !hiveContext.conf.parquetUseDataSourceApi => // Filter out all predicates that only deal with partition keys val partitionsKeys = AttributeSet(relation.partitionKeys) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala index 581f66639949..eae69af5864a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/HiveParquetSuite.scala @@ -28,53 +28,55 @@ class HiveParquetSuite extends QueryTest with ParquetTest { import sqlContext._ - test("Case insensitive attribute names") { - withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") { - val expected = (1 to 4).map(i => Row(i.toString)) - checkAnswer(sql("SELECT upper FROM cases"), expected) - checkAnswer(sql("SELECT LOWER FROM cases"), expected) + def run(prefix: String): 
Unit = { + test(s"$prefix: Case insensitive attribute names") { + withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") { + val expected = (1 to 4).map(i => Row(i.toString)) + checkAnswer(sql("SELECT upper FROM cases"), expected) + checkAnswer(sql("SELECT LOWER FROM cases"), expected) + } } - } - test("SELECT on Parquet table") { - val data = (1 to 4).map(i => (i, s"val_$i")) - withParquetTable(data, "t") { - checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple)) - } - } - - test("Simple column projection + filter on Parquet table") { - withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") { - checkAnswer( - sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"), - Seq(Row(true, "val_2"), Row(true, "val_4"))) + test(s"$prefix: SELECT on Parquet table") { + val data = (1 to 4).map(i => (i, s"val_$i")) + withParquetTable(data, "t") { + checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple)) + } } - } - test("Converting Hive to Parquet Table via saveAsParquetFile") { - withTempPath { dir => - sql("SELECT * FROM src").saveAsParquetFile(dir.getCanonicalPath) - parquetFile(dir.getCanonicalPath).registerTempTable("p") - withTempTable("p") { + test(s"$prefix: Simple column projection + filter on Parquet table") { + withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") { checkAnswer( - sql("SELECT * FROM src ORDER BY key"), - sql("SELECT * from p ORDER BY key").collect().toSeq) + sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"), + Seq(Row(true, "val_2"), Row(true, "val_4"))) } } - } - - test("INSERT OVERWRITE TABLE Parquet table") { - withParquetTable((1 to 4).map(i => (i, s"val_$i")), "t") { - withTempPath { file => - sql("SELECT * FROM t LIMIT 1").saveAsParquetFile(file.getCanonicalPath) - parquetFile(file.getCanonicalPath).registerTempTable("p") + test(s"$prefix: Converting Hive to Parquet Table via saveAsParquetFile") { + withTempPath { dir => + sql("SELECT * FROM src").saveAsParquetFile(dir.getCanonicalPath) + parquetFile(dir.getCanonicalPath).registerTempTable("p") withTempTable("p") { - // let's do three overwrites for good measure - sql("INSERT OVERWRITE TABLE p SELECT * FROM t") - sql("INSERT OVERWRITE TABLE p SELECT * FROM t") - sql("INSERT OVERWRITE TABLE p SELECT * FROM t") - checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq) + checkAnswer( + sql("SELECT * FROM src ORDER BY key"), + sql("SELECT * from p ORDER BY key").collect().toSeq) + } + } + } + + // TODO Re-enable this after data source insertion API is merged + ignore(s"$prefix: INSERT OVERWRITE TABLE Parquet table") { + withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") { + withTempPath { file => + sql("SELECT * FROM t LIMIT 1").saveAsParquetFile(file.getCanonicalPath) + parquetFile(file.getCanonicalPath).registerTempTable("p") + withTempTable("p") { + // let's do three overwrites for good measure + sql("INSERT OVERWRITE TABLE p SELECT * FROM t") + sql("INSERT OVERWRITE TABLE p SELECT * FROM t") + sql("INSERT OVERWRITE TABLE p SELECT * FROM t") + checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq) + } } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 40370359c722..afcbd2cab0ec 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -23,7 +23,7 @@ import java.io.File import 
org.apache.spark.sql.catalyst.expressions.Row import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.{SQLConf, QueryTest} import org.apache.spark.sql.execution.PhysicalRDD import org.apache.spark.sql.hive.execution.HiveTableScan import org.apache.spark.sql.hive.test.TestHive._ @@ -98,7 +98,7 @@ class ParquetMetastoreSuite extends ParquetPartitioningTest { setConf("spark.sql.hive.convertMetastoreParquet", "false") } - test("conversion is working") { + test(s"conversion is working") { assert( sql("SELECT * FROM normal_parquet").queryExecution.executedPlan.collect { case _: HiveTableScan => true @@ -187,99 +187,107 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll } } - Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => - test(s"ordering of the partitioning columns $table") { - checkAnswer( - sql(s"SELECT p, stringField FROM $table WHERE p = 1"), - Seq.fill(10)(Row(1, "part-1")) - ) - - checkAnswer( - sql(s"SELECT stringField, p FROM $table WHERE p = 1"), - Seq.fill(10)(Row("part-1", 1)) - ) - } - - test(s"project the partitioning column $table") { - checkAnswer( - sql(s"SELECT p, count(*) FROM $table group by p"), - Row(1, 10) :: - Row(2, 10) :: - Row(3, 10) :: - Row(4, 10) :: - Row(5, 10) :: - Row(6, 10) :: - Row(7, 10) :: - Row(8, 10) :: - Row(9, 10) :: - Row(10, 10) :: Nil - ) + def run(prefix: String): Unit = { + Seq("partitioned_parquet", "partitioned_parquet_with_key").foreach { table => + test(s"$prefix: ordering of the partitioning columns $table") { + checkAnswer( + sql(s"SELECT p, stringField FROM $table WHERE p = 1"), + Seq.fill(10)(Row(1, "part-1")) + ) + + checkAnswer( + sql(s"SELECT stringField, p FROM $table WHERE p = 1"), + Seq.fill(10)(Row("part-1", 1)) + ) + } + + test(s"$prefix: project the partitioning column $table") { + checkAnswer( + sql(s"SELECT p, count(*) FROM $table group by p"), + Row(1, 10) :: + Row(2, 10) :: + Row(3, 10) :: + Row(4, 10) :: + Row(5, 10) :: + Row(6, 10) :: + Row(7, 10) :: + Row(8, 10) :: + Row(9, 10) :: + Row(10, 10) :: Nil + ) + } + + test(s"$prefix: project partitioning and non-partitioning columns $table") { + checkAnswer( + sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), + Row("part-1", 1, 10) :: + Row("part-2", 2, 10) :: + Row("part-3", 3, 10) :: + Row("part-4", 4, 10) :: + Row("part-5", 5, 10) :: + Row("part-6", 6, 10) :: + Row("part-7", 7, 10) :: + Row("part-8", 8, 10) :: + Row("part-9", 9, 10) :: + Row("part-10", 10, 10) :: Nil + ) + } + + test(s"$prefix: simple count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table"), + Row(100)) + } + + test(s"$prefix: pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), + Row(10)) + } + + test(s"$prefix: non-existant partition $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), + Row(0)) + } + + test(s"$prefix: multi-partition pruned count $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), + Row(30)) + } + + test(s"$prefix: non-partition predicates $table") { + checkAnswer( + sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), + Row(30)) + } + + test(s"$prefix: sum $table") { + checkAnswer( + sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), + Row(1 + 2 + 3)) + } + + test(s"$prefix: hive udfs $table") { + checkAnswer( + sql(s"SELECT concat(stringField, stringField) FROM $table"), + sql(s"SELECT stringField FROM 
$table").map { + case Row(s: String) => Row(s + s) + }.collect().toSeq) + } } - test(s"project partitioning and non-partitioning columns $table") { + test(s"$prefix: $prefix: non-part select(*)") { checkAnswer( - sql(s"SELECT stringField, p, count(intField) FROM $table GROUP BY p, stringField"), - Row("part-1", 1, 10) :: - Row("part-2", 2, 10) :: - Row("part-3", 3, 10) :: - Row("part-4", 4, 10) :: - Row("part-5", 5, 10) :: - Row("part-6", 6, 10) :: - Row("part-7", 7, 10) :: - Row("part-8", 8, 10) :: - Row("part-9", 9, 10) :: - Row("part-10", 10, 10) :: Nil - ) - } - - test(s"simple count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table"), - Row(100)) - } - - test(s"pruned count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p = 1"), + sql("SELECT COUNT(*) FROM normal_parquet"), Row(10)) } - - test(s"non-existant partition $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), - Row(0)) - } - - test(s"multi-partition pruned count $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE p IN (1,2,3)"), - Row(30)) - } - - test(s"non-partition predicates $table") { - checkAnswer( - sql(s"SELECT COUNT(*) FROM $table WHERE intField IN (1,2,3)"), - Row(30)) - } - - test(s"sum $table") { - checkAnswer( - sql(s"SELECT SUM(intField) FROM $table WHERE intField IN (1,2,3) AND p = 1"), - Row(1 + 2 + 3)) - } - - test(s"hive udfs $table") { - checkAnswer( - sql(s"SELECT concat(stringField, stringField) FROM $table"), - sql(s"SELECT stringField FROM $table").map { - case Row(s: String) => Row(s + s) - }.collect().toSeq) - } } - test("non-part select(*)") { - checkAnswer( - sql("SELECT COUNT(*) FROM normal_parquet"), - Row(10)) - } + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") + run("Enable Parquet data source") + + setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") + run("Disable Parquet data source") } From dd704fd2fdc7b610983aab8392eb1c307d13a13b Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 2 Feb 2015 13:05:46 -0800 Subject: [PATCH 05/14] Fixes Python Parquet API --- python/pyspark/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 3ac8ea597e14..0a6d68d4c230 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1471,7 +1471,7 @@ def registerRDDAsTable(self, rdd, tableName): else: raise ValueError("Can only register DataFrame as table") - def parquetFile(self, path): + def parquetFile(self, *paths): """Loads a Parquet file, returning the result as a L{DataFrame}. 
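        Illustrative note, not part of the original patch: with the varargs signature
        above, several Parquet locations could be read in a single call, e.g. (the paths
        below are hypothetical):

        >>> # df = sqlCtx.parquetFile("/data/events/2015-01", "/data/events/2015-02")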
>>> import tempfile, shutil @@ -1483,7 +1483,7 @@ def parquetFile(self, path): >>> sorted(df.collect()) == sorted(df2.collect()) True """ - jdf = self._ssql_ctx.parquetFile(path) + jdf = self._ssql_ctx.parquetFile(*path) return DataFrame(jdf, self) def jsonFile(self, path, schema=None, samplingRatio=1.0): From b35c8c6d2a95da28e9a91a015b005ad6149044a1 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 2 Feb 2015 13:55:10 -0800 Subject: [PATCH 06/14] Fixes some typos and outdated comments --- .../org/apache/spark/sql/parquet/ParquetTest.scala | 2 +- .../org/apache/spark/sql/parquet/newParquet.scala | 12 ++++++++---- .../org/apache/spark/sql/parquet/parquetSuites.scala | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala index ee0558c22183..8d3e094e3344 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTest.scala @@ -90,7 +90,7 @@ trait ParquetTest { (f: String => Unit): Unit = { import sqlContext.implicits._ withTempPath { file => - sparkContext.parallelize(data, 3).saveAsParquetFile(file.getCanonicalPath) + sparkContext.parallelize(data).saveAsParquetFile(file.getCanonicalPath) f(file.getCanonicalPath) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index b1050b2fbc15..cc449117568c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -482,12 +482,11 @@ object ParquetRelation2 { * Parses a single partition, returns column names and values of each partition column. For * example, given: * {{{ - * basePath = hdfs://:/path/to/partition - * partitionPath = hdfs://:/path/to/partition/a=42/b=hello/c=3.14 + * path = hdfs://:/path/to/partition/a=42/b=hello/c=3.14 * }}} * it returns: * {{{ - * PartitionDesc( + * PartitionValues( * Seq("a", "b", "c"), * Seq( * Literal(42, IntegerType), @@ -587,7 +586,12 @@ object ParquetRelation2 { * types. 
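   * An illustrative example, added for clarity and not part of the original patch: for a
   * partition column whose parsed values are
   * {{{
   *   Seq(Literal(10, IntegerType), Literal(10.5, FloatType))
   * }}}
   * the desired type is FloatType (assuming IntegerType precedes FloatType in `upCastingOrder`,
   * consistent with the partition discovery tests added later in this series), so
   * `Literal(10, IntegerType)` is cast to `Literal(10.0, FloatType)`. Only when every value of
   * the column is `NullType` does the code below fall back to `StringType`.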
*/ private def resolveTypeConflicts(literals: Seq[Literal]): Seq[Literal] = { - val desiredType = literals.map(_.dataType).maxBy(upCastingOrder.indexOf(_)) + val desiredType = { + val topType = literals.map(_.dataType).maxBy(upCastingOrder.indexOf(_)) + // Falls back to string if all values of this column are null or empty string + if (topType == NullType) StringType else topType + } + literals.map { case l @ Literal(_, dataType) => Literal(Cast(l, desiredType).eval(), desiredType) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index afcbd2cab0ec..1045912f3a5c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -245,7 +245,7 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll Row(10)) } - test(s"$prefix: non-existant partition $table") { + test(s"$prefix: non-existent partition $table") { checkAnswer( sql(s"SELECT COUNT(*) FROM $table WHERE p = 1000"), Row(0)) From 0d8ec1dbbe18018b3f05fa664e8cfeb53bb1c198 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 2 Feb 2015 16:53:25 -0800 Subject: [PATCH 07/14] Adds more test cases --- .../apache/spark/sql/parquet/newParquet.scala | 20 +-- .../sql/parquet/ParquetFilterSuite.scala | 3 +- .../ParquetPartitionDiscoverySuite.scala | 126 ++++++++++++++++++ .../sql/parquet/ParquetSchemaSuite.scala | 37 +++++ 4 files changed, 177 insertions(+), 9 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index cc449117568c..eb5d1d0f5d1f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -508,7 +508,7 @@ object ParquetRelation2 { finished = maybeColumn.isEmpty || chopped.isRoot } - val (columnNames, values) = columns.unzip + val (columnNames, values) = columns.reverse.unzip PartitionValues(columnNames, values) } @@ -520,8 +520,12 @@ object ParquetRelation2 { None } else { val columnName = columnSpec.take(equalSignIndex) - val literal = inferPartitionColumnValue( - columnSpec.drop(equalSignIndex + 1), defaultPartitionName) + assert(columnName.nonEmpty, s"Empty partition column name in '$columnSpec'") + + val rawColumnValue = columnSpec.drop(equalSignIndex + 1) + assert(rawColumnValue.nonEmpty, s"Empty partition column value in '$columnSpec'") + + val literal = inferPartitionColumnValue(rawColumnValue, defaultPartitionName) Some(columnName -> literal) } } @@ -536,9 +540,9 @@ object ParquetRelation2 { * StringType * }}} */ - private[parquet] def resolvePartitions(descs: Seq[PartitionValues]): Seq[PartitionValues] = { - val distinctColNamesOfPartitions = descs.map(_.columnNames).distinct - val columnCount = descs.head.columnNames.size + private[parquet] def resolvePartitions(values: Seq[PartitionValues]): Seq[PartitionValues] = { + val distinctColNamesOfPartitions = values.map(_.columnNames).distinct + val columnCount = values.head.columnNames.size // Column names of all partitions must match assert(distinctColNamesOfPartitions.size == 1, { @@ -548,11 +552,11 @@ object ParquetRelation2 { // Resolves possible type conflicts for each column val resolvedValues = (0 until 
columnCount).map { i => - resolveTypeConflicts(descs.map(_.literals(i))) + resolveTypeConflicts(values.map(_.literals(i))) } // Fills resolved literals back to each partition - descs.zipWithIndex.map { case (d, index) => + values.zipWithIndex.map { case (d, index) => d.copy(literals = resolvedValues.map(_(index))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index e0fd8aa5f917..f60fae78c60a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -257,7 +257,8 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { '_1.isNotNull, classOf[NotEq[_]], (1 to 4).map(i => Row.apply(i.toString))) checkFilterPredicate('_1 === "1", classOf[Eq[_]], "1") - checkFilterPredicate('_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) + checkFilterPredicate( + '_1 !== "1", classOf[NotEq[_]], (2 to 4).map(i => Row.apply(i.toString))) checkFilterPredicate('_1 < "2", classOf[Lt[_]], "1") checkFilterPredicate('_1 > "3", classOf[Gt[_]], "4") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala new file mode 100644 index 000000000000..ae606d11a8f6 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.parquet + +import scala.collection.mutable.ArrayBuffer + +import org.apache.hadoop.fs.Path +import org.scalatest.FunSuite + +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.parquet.ParquetRelation2._ +import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Row, SQLContext} + +class ParquetPartitionDiscoverySuite extends FunSuite with ParquetTest { + override val sqlContext: SQLContext = TestSQLContext + + val defaultPartitionName = "__NULL__" + + test("column type inference") { + def check(raw: String, literal: Literal): Unit = { + assert(inferPartitionColumnValue(raw, defaultPartitionName) === literal) + } + + check("10", Literal(10, IntegerType)) + check("1000000000000000", Literal(1000000000000000L, LongType)) + check("1.5", Literal(1.5, FloatType)) + check("hello", Literal("hello", StringType)) + check(defaultPartitionName, Literal(null, NullType)) + } + + test("parse partition") { + def check(path: String, expected: PartitionValues): Unit = { + assert(expected === parsePartition(new Path(path), defaultPartitionName)) + } + + def checkThrows[T <: Throwable: Manifest](path: String, expected: String): Unit = { + val message = intercept[T] { + parsePartition(new Path(path), defaultPartitionName) + }.getMessage + + assert(message.contains(expected)) + } + + check( + "file:///", + PartitionValues( + ArrayBuffer.empty[String], + ArrayBuffer.empty[Literal])) + + check( + "file://path/a=10", + PartitionValues( + ArrayBuffer("a"), + ArrayBuffer(Literal(10, IntegerType)))) + + check( + "file://path/a=10/b=hello/c=1.5", + PartitionValues( + ArrayBuffer("a", "b", "c"), + ArrayBuffer( + Literal(10, IntegerType), + Literal("hello", StringType), + Literal(1.5, FloatType)))) + + check( + "file://path/a=10/b_hello/c=1.5", + PartitionValues( + ArrayBuffer("c"), + ArrayBuffer(Literal(1.5, FloatType)))) + + checkThrows[AssertionError]("file://path/=10", "Empty partition column name") + checkThrows[AssertionError]("file://path/a=", "Empty partition column value") + } + + test("parse partitions") { + def check(paths: Seq[String], spec: PartitionSpec): Unit = { + assert(parsePartitions(paths.map(new Path(_)), defaultPartitionName) === spec) + } + + check(Seq( + "hdfs://host:9000/path/a=10/b=hello"), + PartitionSpec( + StructType(Seq( + StructField("a", IntegerType), + StructField("b", StringType))), + Seq(Partition(Row(10, "hello"), "hdfs://host:9000/path/a=10/b=hello")))) + + check(Seq( + "hdfs://host:9000/path/a=10/b=20", + "hdfs://host:9000/path/a=10.5/b=hello"), + PartitionSpec( + StructType(Seq( + StructField("a", FloatType), + StructField("b", StringType))), + Seq( + Partition(Row(10, "20"), "hdfs://host:9000/path/a=10/b=20"), + Partition(Row(10.5, "hello"), "hdfs://host:9000/path/a=10.5/b=hello")))) + + check(Seq( + s"hdfs://host:9000/path/a=10/b=$defaultPartitionName", + s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName"), + PartitionSpec( + StructType(Seq( + StructField("a", FloatType), + StructField("b", StringType))), + Seq( + Partition(Row(10, null), s"hdfs://host:9000/path/a=10/b=$defaultPartitionName"), + Partition(Row(10.5, null), s"hdfs://host:9000/path/a=10.5/b=$defaultPartitionName")))) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 5f7f31d395cf..2e6c2d5f9ab5 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -25,6 +25,7 @@ import parquet.schema.MessageTypeParser import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.types._ class ParquetSchemaSuite extends FunSuite with ParquetTest { val sqlContext = TestSQLContext @@ -192,4 +193,40 @@ class ParquetSchemaSuite extends FunSuite with ParquetTest { assert(a.nullable === b.nullable) } } + + test("merge with metastore schema") { + // Field type conflict resolution + assertResult( + StructType(Seq( + StructField("lowerCase", StringType), + StructField("UPPERCase", DoubleType, nullable = false)))) { + + ParquetRelation2.mergeMetastoreParquetSchema( + StructType(Seq( + StructField("lowercase", StringType), + StructField("uppercase", DoubleType, nullable = false))), + + StructType(Seq( + StructField("lowerCase", BinaryType), + StructField("UPPERCase", IntegerType, nullable = true)))) + } + + // Conflicting field count + assert(intercept[Throwable] { + ParquetRelation2.mergeMetastoreParquetSchema( + StructType(Seq( + StructField("uppercase", DoubleType, nullable = false))), + + StructType(Seq( + StructField("lowerCase", BinaryType), + StructField("UPPERCase", IntegerType, nullable = true)))) + }.getMessage.contains("detected conflicting schemas")) + + // Conflicting field names + intercept[Throwable] { + ParquetRelation2.mergeMetastoreParquetSchema( + StructType(Seq(StructField("lower", StringType))), + StructType(Seq(StructField("lowerCase", BinaryType)))) + } + } } From 4e0175f8bd079416b7b76b31fc80f53b53f37a57 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 2 Feb 2015 17:30:41 -0800 Subject: [PATCH 08/14] Fixes Python Parquet API, we need Py4J array to call varargs method --- python/pyspark/sql.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 0a6d68d4c230..a4cb38675462 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1483,7 +1483,11 @@ def parquetFile(self, *paths): >>> sorted(df.collect()) == sorted(df2.collect()) True """ - jdf = self._ssql_ctx.parquetFile(*path) + gateway = self._sc._gateway + jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths)) + for i in range(0, len(paths)): + jpaths[i] = paths[i] + jdf = self._ssql_ctx.parquetFile(jpaths) return DataFrame(jdf, self) def jsonFile(self, path, schema=None, samplingRatio=1.0): From adf2aae5390be42d5c1ba778efc42cb00b684eaa Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 2 Feb 2015 20:03:52 -0800 Subject: [PATCH 09/14] Fixes compilation error introduced while rebasing --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index df43d51c6ae9..9d7fdf9f7edf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -308,7 +308,7 @@ class SQLContext(@transient val sparkContext: SparkContext) if (conf.parquetUseDataSourceApi) { baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this)) } else { - new DataFrame(this, parquet.ParquetRelation( + DataFrame(this, parquet.ParquetRelation( paths.mkString(","), Some(sparkContext.hadoopConfiguration), this)) 
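      // Illustrative note (not part of this patch): both branches are reached through the same
      // varargs call, e.g. sqlContext.parquetFile("/data/t/part1", "/data/t/part2") with
      // hypothetical paths; conf.parquetUseDataSourceApi selects between ParquetRelation2 (the
      // new data source API) and the legacy ParquetRelation scan.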
} From 50dd8d119e4849893d6f84ab80ffb12f5740a340 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 3 Feb 2015 00:45:13 -0800 Subject: [PATCH 10/14] Addresses @rxin's comment, fixes UDT schema merging --- python/pyspark/sql.py | 7 ++++--- .../src/main/scala/org/apache/spark/sql/SQLContext.scala | 4 ++-- .../scala/org/apache/spark/sql/parquet/ParquetTypes.scala | 3 +++ .../scala/org/apache/spark/sql/hive/HiveStrategies.scala | 4 +++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index a4cb38675462..e55f285a778c 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1484,10 +1484,11 @@ def parquetFile(self, *paths): True """ gateway = self._sc._gateway - jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths)) - for i in range(0, len(paths)): + jpath = paths[0] + jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths) - 1) + for i in range(1, len(paths)): jpaths[i] = paths[i] - jdf = self._ssql_ctx.parquetFile(jpaths) + jdf = self._ssql_ctx.parquetFile(jpath, jpaths) return DataFrame(jdf, self) def jsonFile(self, path, schema=None, samplingRatio=1.0): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 9d7fdf9f7edf..706ef6ad4f17 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -304,9 +304,9 @@ class SQLContext(@transient val sparkContext: SparkContext) * @group userf */ @scala.annotation.varargs - def parquetFile(paths: String*): DataFrame = + def parquetFile(path: String, paths: String*): DataFrame = if (conf.parquetUseDataSourceApi) { - baseRelationToDataFrame(parquet.ParquetRelation2(paths, Map.empty)(this)) + baseRelationToDataFrame(parquet.ParquetRelation2(path +: paths, Map.empty)(this)) } else { DataFrame(this, parquet.ParquetRelation( paths.mkString(","), Some(sparkContext.hadoopConfiguration), this)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala index 73e43611707c..a21c93d55778 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala @@ -567,6 +567,9 @@ private[parquet] object ParquetTypesConverter extends Logging { DecimalType.Fixed(rightPrecision, rightScale)) => DecimalType(leftPrecision.max(rightPrecision), leftScale.max(rightScale)) + case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_]) + if leftUdt.userClass == rightUdt.userClass => leftUdt + case (leftType, rightType) if leftType == rightType => leftType diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 12e9bc8fa05e..95abc363ae76 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -137,8 +137,10 @@ private[hive] trait HiveStrategies { pruningCondition(inputData) } + val partitionLocations = partitions.map(_.getLocation) + hiveContext - .parquetFile(partitions.map(_.getLocation): _*) + .parquetFile(partitionLocations.head, partitionLocations.tail: _*) .addPartitioningAttributes(relation.partitionKeys) .lowerCase .where(unresolvedOtherPredicates) From 808380fd6aeda81d6950f2ddff706ff99c6abb4b Mon 
Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 3 Feb 2015 12:42:44 -0800 Subject: [PATCH 11/14] Fixes issues introduced while rebasing --- .../scala/org/apache/spark/sql/parquet/ParquetTypes.scala | 4 ++-- .../scala/org/apache/spark/sql/parquet/newParquet.scala | 5 ++++- .../org/apache/spark/sql/parquet/ParquetFilterSuite.scala | 4 ++-- .../org/apache/spark/sql/parquet/ParquetIOSuite.scala | 4 ++-- .../org/apache/spark/sql/parquet/ParquetQuerySuite.scala | 8 ++++---- .../org/apache/spark/sql/parquet/parquetSuites.scala | 4 ++-- 6 files changed, 16 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala index a21c93d55778..35bbac8cf903 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala @@ -545,10 +545,10 @@ private[parquet] object ParquetTypesConverter extends Logging { val newFields = ArrayBuffer.empty[StructField] leftFields.foreach { - case leftField @ StructField(leftName, leftType, leftNullable, leftMetadata) => + case leftField @ StructField(leftName, leftType, leftNullable, _) => rightFields .find(_.name == leftName) - .map { case rightField @ StructField(_, rightType, rightNullable, rightMeatadata) => + .map { case rightField @ StructField(_, rightType, rightNullable, _) => leftField.copy( dataType = mergeCatalystDataTypes(leftType, rightType), nullable = leftNullable || rightNullable) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index eb5d1d0f5d1f..dc9cc79546f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -393,7 +393,10 @@ object ParquetRelation2 { // Falls back to Parquet schema if Spark SQL schema is absent. StructType.fromAttributes( // TODO Really no need to use `Attribute` here, we only need to know the data type. 
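          // Clarifying note (an assumption, not part of the original patch): the two flags
          // threaded through below correspond to spark.sql.parquet.binaryAsString (read
          // un-annotated BINARY columns as StringType) and spark.sql.parquet.int96AsTimestamp
          // (read INT96 columns, e.g. those written by Impala, as TimestampType).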
- convertToAttributes(parquetSchema, sqlContext.conf.isParquetBinaryAsString)) + convertToAttributes( + parquetSchema, + sqlContext.conf.isParquetBinaryAsString, + sqlContext.conf.isParquetINT96AsTimestamp)) } }.reduce { (left, right) => try mergeCatalystSchemas(left, right) catch { case e: Throwable => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index f60fae78c60a..f8117c21773a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -312,10 +312,10 @@ class ParquetFilterSuite extends QueryTest with ParquetTest { } withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { - run("Enable Parquet data source") + run("Parquet data source enabled") } withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { - run("Disable Parquet data source") + run("Parquet data source disabled") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index 757eb61bac7c..c8dff38f17ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -291,10 +291,10 @@ class ParquetIOSuite extends QueryTest with ParquetTest { } withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { - run("Enable Parquet data source") + run("Parquet data source enabled") } withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { - run("Disable Parquet data source") + run("Parquet data source disabled") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index 6f4f8ff9c25d..b392b3180151 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -46,7 +46,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { // This test case will trigger the NPE mentioned in // https://issues.apache.org/jira/browse/PARQUET-151. 
- ignore("overwriting") { + ignore(s"$prefix: overwriting") { val data = (0 until 10).map(i => (i, i.toString)) withParquetTable(data, "t") { sql("INSERT OVERWRITE TABLE t SELECT * FROM t") @@ -54,7 +54,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { } } - test("self-join") { + test(s"$prefix: self-join") { // 4 rows, cells of column 1 of row 2 and row 4 are null val data = (1 to 4).map { i => val maybeInt = if (i % 2 == 0) None else Some(i) @@ -111,10 +111,10 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { } withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") { - run("Enable Parquet data source") + run("Parquet data source enabled") } withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") { - run("Disable Parquet data source") + run("Parquet data source disabled") } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala index 1045912f3a5c..a7479a5b9586 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/parquet/parquetSuites.scala @@ -286,8 +286,8 @@ abstract class ParquetPartitioningTest extends QueryTest with BeforeAndAfterAll } setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false") - run("Enable Parquet data source") + run("Parquet data source enabled") setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true") - run("Disable Parquet data source") + run("Parquet data source disabled") } From a49bd288d8c9c793936dc3db93affdc40b102506 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 3 Feb 2015 13:04:34 -0800 Subject: [PATCH 12/14] Fixes spelling typo in trait name "CreateableRelationProvider" --- .../main/scala/org/apache/spark/sql/json/JSONRelation.scala | 2 +- .../src/main/scala/org/apache/spark/sql/sources/ddl.scala | 6 +++--- .../scala/org/apache/spark/sql/sources/interfaces.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala index 8372decbf8aa..f27585d05a98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JSONRelation.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.types.StructType private[sql] class DefaultSource - extends RelationProvider with SchemaRelationProvider with CreateableRelationProvider { + extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { /** Returns a new base relation with the parameters. 
*/ override def createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index 2ef740b3be0b..f1adf60efaf8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -279,9 +279,9 @@ object ResolvedDataSource { } val relation = clazz.newInstance match { - case dataSource: org.apache.spark.sql.sources.CreateableRelationProvider => + case dataSource: org.apache.spark.sql.sources.CreatableRelationProvider => dataSource - .asInstanceOf[org.apache.spark.sql.sources.CreateableRelationProvider] + .asInstanceOf[org.apache.spark.sql.sources.CreatableRelationProvider] .createRelation(sqlContext, options, data) case _ => sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") @@ -366,7 +366,7 @@ private [sql] case class CreateTempTableUsingAsSelect( /** * Builds a map in which keys are case insensitive */ -protected class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] +protected class CaseInsensitiveMap(map: Map[String, String]) extends Map[String, String] with Serializable { val baseMap = map.map(kv => kv.copy(_1 = kv._1.toLowerCase)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index ad0a35b91ebc..40fc1f2aa272 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -78,7 +78,7 @@ trait SchemaRelationProvider { } @DeveloperApi -trait CreateableRelationProvider { +trait CreatableRelationProvider { def createRelation( sqlContext: SQLContext, parameters: Map[String, String], From 8232e174552c5b126eb31e923d2ce6265032be1e Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 4 Feb 2015 00:02:17 -0800 Subject: [PATCH 13/14] Write support for Parquet data source --- .../org/apache/spark/sql/DataFrameImpl.scala | 6 +- .../sql/parquet/ParquetTableSupport.scala | 9 +- .../apache/spark/sql/parquet/newParquet.scala | 328 +++++++++++++----- .../sql/sources/DataSourceStrategy.scala | 8 +- .../org/apache/spark/sql/sources/ddl.scala | 18 +- .../spark/sql/parquet/ParquetIOSuite.scala | 3 +- .../spark/sql/parquet/ParquetQuerySuite.scala | 4 +- 7 files changed, 254 insertions(+), 122 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala index d6df927f9d42..58d11751353b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameImpl.scala @@ -295,7 +295,11 @@ private[sql] class DataFrameImpl protected[sql]( } override def saveAsParquetFile(path: String): Unit = { - sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd + if (sqlContext.conf.parquetUseDataSourceApi) { + save("org.apache.spark.sql.parquet", "path" -> path) + } else { + sqlContext.executePlan(WriteToFile(path, logicalPlan)).toRdd + } } override def saveAsTable(tableName: String): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala index 14c81ae4eba4..19bfba34b8f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTableSupport.scala @@ -159,7 +159,7 @@ private[parquet] class RowWriteSupport extends WriteSupport[Row] with Logging { val attributesSize = attributes.size if (attributesSize > record.size) { throw new IndexOutOfBoundsException( - s"Trying to write more fields than contained in row (${attributesSize}>${record.size})") + s"Trying to write more fields than contained in row ($attributesSize > ${record.size})") } var index = 0 @@ -325,7 +325,7 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport { val attributesSize = attributes.size if (attributesSize > record.size) { throw new IndexOutOfBoundsException( - s"Trying to write more fields than contained in row (${attributesSize}>${record.size})") + s"Trying to write more fields than contained in row ($attributesSize > ${record.size})") } var index = 0 @@ -348,10 +348,7 @@ private[parquet] class MutableRowWriteSupport extends RowWriteSupport { index: Int): Unit = { ctype match { case StringType => writer.addBinary( - Binary.fromByteArray( - record(index).asInstanceOf[String].getBytes("utf-8") - ) - ) + Binary.fromByteArray(record(index).asInstanceOf[String].getBytes("utf-8"))) case BinaryType => writer.addBinary( Binary.fromByteArray(record(index).asInstanceOf[Array[Byte]])) case IntegerType => writer.addInteger(record.getInt(index)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index dc9cc79546f3..4abe754383a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -16,9 +16,11 @@ */ package org.apache.spark.sql.parquet +import java.io.IOException import java.lang.{Double => JDouble, Float => JFloat, Long => JLong} import java.math.{BigDecimal => JBigDecimal} -import java.util.{List => JList} +import java.text.SimpleDateFormat +import java.util.{List => JList, Date} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer @@ -28,6 +30,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.mapreduce.{InputSplit, Job, JobContext} import parquet.filter2.predicate.FilterApi import parquet.format.converter.ParquetMetadataConverter @@ -36,13 +39,14 @@ import parquet.hadoop.util.ContextUtil import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil import org.apache.spark.rdd.{NewHadoopPartition, NewHadoopRDD, RDD} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{IntegerType, StructField, StructType, _} -import org.apache.spark.sql.{Row, SQLConf, SQLContext} -import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} +import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext} +import org.apache.spark.{Partition => SparkPartition, TaskContext, SerializableWritable, Logging, SparkException} /** @@ -51,25 +55,43 @@ import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} * required is `path`, which should be the location of a 
collection of, optionally partitioned, * parquet files. */ -class DefaultSource extends RelationProvider with SchemaRelationProvider { +class DefaultSource + extends RelationProvider + with SchemaRelationProvider + with CreatableRelationProvider { + private def checkPath(parameters: Map[String, String]): String = { + parameters.getOrElse("path", sys.error("'path' must be specified for parquet tables.")) + } + /** Returns a new base relation with the given parameters. */ override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { - val path = parameters.getOrElse("path", - sys.error("'path' must be specified for parquet tables.")) - - ParquetRelation2(Seq(path), parameters, None)(sqlContext) + ParquetRelation2(Seq(checkPath(parameters)), parameters, None)(sqlContext) } override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { - val path = parameters.getOrElse("path", - sys.error("'path' must be specified for parquet tables.")) + ParquetRelation2(Seq(checkPath(parameters)), parameters, Some(schema))(sqlContext) + } - ParquetRelation2(Seq(path), parameters, Some(schema))(sqlContext) + override def createRelation( + sqlContext: SQLContext, + parameters: Map[String, String], + data: DataFrame): BaseRelation = { + val path = checkPath(parameters) + ParquetRelation.createEmpty( + path, + data.schema.toAttributes, + false, + sqlContext.sparkContext.hadoopConfiguration, + sqlContext) + + val relation = createRelation(sqlContext, parameters, data.schema) + relation.asInstanceOf[ParquetRelation2].insert(data, true) + relation } } @@ -104,96 +126,113 @@ private[parquet] case class PartitionSpec(partitionColumns: StructType, partitio case class ParquetRelation2 (paths: Seq[String], parameters: Map[String, String], maybeSchema: Option[StructType] = None) (@transient val sqlContext: SQLContext) - extends CatalystScan with Logging { + extends CatalystScan + with InsertableRelation + with SparkHadoopMapReduceUtil + with Logging { // Should we merge schemas from all Parquet part-files? 
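  // Illustrative sketch, not part of this patch: the flag would typically be supplied through
  // the data source OPTIONS clause, assuming ParquetRelation2.MERGE_SCHEMA maps to the key
  // "mergeSchema" and that '/data/t' is a hypothetical path:
  //   CREATE TEMPORARY TABLE t
  //   USING org.apache.spark.sql.parquet
  //   OPTIONS (path '/data/t', mergeSchema 'false')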
private val shouldMergeSchemas = parameters.getOrElse(ParquetRelation2.MERGE_SCHEMA, "true").toBoolean - def sparkContext = sqlContext.sparkContext - - private val fs = FileSystem.get(sparkContext.hadoopConfiguration) - - private val baseStatuses = { - val statuses = paths.distinct.map(p => fs.getFileStatus(fs.makeQualified(new Path(p)))) - assert(statuses.forall(_.isFile) || statuses.forall(_.isDir)) - statuses + // Optional Metastore schema, used when converting Hive Metastore Parquet table + private val maybeMetastoreSchema = + parameters + .get(ParquetRelation2.METASTORE_SCHEMA) + .map(s => DataType.fromJson(s).asInstanceOf[StructType]) + + // Hive uses this as part of the default partition name when the partition column value is null + // or empty string + private val defaultPartitionName = parameters.getOrElse( + ParquetRelation2.DEFAULT_PARTITION_NAME, "__HIVE_DEFAULT_PARTITION__") + + override def equals(other: Any) = other match { + case relation: ParquetRelation2 => + paths.toSet == relation.paths.toSet && + maybeMetastoreSchema == relation.maybeMetastoreSchema && + (shouldMergeSchemas == relation.shouldMergeSchemas || schema == relation.schema) } - private val leafStatuses = baseStatuses.flatMap { f => - val statuses = SparkHadoopUtil.get.listLeafStatuses(fs, f.getPath).filter { f => - isSummaryFile(f.getPath) || - !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) - } - assert(statuses.nonEmpty, s"${f.getPath} is an empty folder.") - statuses - } + private[sql] def sparkContext = sqlContext.sparkContext - private val (dataStatuses, metadataStatuses, commonMetadataStatuses) = { - (leafStatuses.filterNot(f => isSummaryFile(f.getPath)).toSeq, - leafStatuses.filter(f => f.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE), - leafStatuses.filter(f => f.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE)) - } + @transient private val fs = FileSystem.get(sparkContext.hadoopConfiguration) - private val footers = { - // TODO Issue a Spark job to gather footers if there are too many files - (dataStatuses ++ metadataStatuses ++ commonMetadataStatuses).par.map { f => - val parquetMetadata = ParquetFileReader.readFooter( - sparkContext.hadoopConfiguration, f, ParquetMetadataConverter.NO_FILTER) - f -> new Footer(f.getPath, parquetMetadata) - }.seq.toMap - } + private class MetadataCache { + private var metadataStatuses: Array[FileStatus] = _ + private var commonMetadataStatuses: Array[FileStatus] = _ + private var footers: Map[FileStatus, Footer] = _ + private var parquetSchema: StructType = _ - private val partitionSpec = { - val partitionDirs = - dataStatuses - .filterNot(baseStatuses.contains) - .map(_.getPath.getParent) - .distinct - - // Hive uses this as part of the default partition name when the partition column value is null - // or empty string - val defaultPartitionName = parameters.getOrElse( - ParquetRelation2.DEFAULT_PARTITION_NAME, - "__HIVE_DEFAULT_PARTITION__") - - if (partitionDirs.nonEmpty) { - ParquetRelation2.parsePartitions(partitionDirs, defaultPartitionName) - } else { - // No partition directories found, makes an empty specification - PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[Partition]) - } - } + var dataStatuses: Array[FileStatus] = _ + var partitionSpec: PartitionSpec = _ + var schema: StructType = _ + var dataSchemaIncludesPartitionKeys: Boolean = _ - private val PartitionSpec(partitionColumns, partitions) = partitionSpec + def refresh(): Unit = { + val baseStatuses = { + val statuses = 
paths.distinct.map(p => fs.getFileStatus(fs.makeQualified(new Path(p)))) + // Support either reading a collection of raw Parquet part-files, or a collection of folders + // containing Parquet files (e.g. partitioned Parquet table). + assert(statuses.forall(_.isFile) || statuses.forall(_.isDir)) + statuses.toArray + } - private def isPartitioned = partitionColumns.nonEmpty + val leaves = baseStatuses.flatMap { f => + val statuses = SparkHadoopUtil.get.listLeafStatuses(fs, f.getPath).filter { f => + isSummaryFile(f.getPath) || + !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith(".")) + } + assert(statuses.nonEmpty, s"${f.getPath} is an empty folder.") + statuses + } - private val dataSchema = maybeSchema.getOrElse(readSchema()) + dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath)) + metadataStatuses = leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE) + commonMetadataStatuses = + leaves.filter(_.getPath.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) + + footers = (dataStatuses ++ metadataStatuses ++ commonMetadataStatuses).par.map { f => + val parquetMetadata = ParquetFileReader.readFooter( + sparkContext.hadoopConfiguration, f, ParquetMetadataConverter.NO_FILTER) + f -> new Footer(f.getPath, parquetMetadata) + }.seq.toMap + + partitionSpec = { + val partitionDirs = dataStatuses + .filterNot(baseStatuses.contains) + .map(_.getPath.getParent) + .distinct + + if (partitionDirs.nonEmpty) { + ParquetRelation2.parsePartitions(partitionDirs, defaultPartitionName) + } else { + // No partition directories found, makes an empty specification + PartitionSpec(StructType(Seq.empty[StructField]), Seq.empty[Partition]) + } + } - private val dataSchemaIncludesPartitionKeys = - isPartitioned && partitionColumns.forall(f => dataSchema.fieldNames.contains(f.name)) + parquetSchema = maybeSchema.getOrElse(readSchema()) - override val schema = { - val fullParquetSchema = if (dataSchemaIncludesPartitionKeys) { - dataSchema - } else { - StructType(dataSchema.fields ++ partitionColumns.fields) - } + dataSchemaIncludesPartitionKeys = + isPartitioned && + partitionColumns.forall(f => metadataCache.parquetSchema.fieldNames.contains(f.name)) - val maybeMetastoreSchema = - parameters - .get(ParquetRelation2.METASTORE_SCHEMA) - .map(s => DataType.fromJson(s).asInstanceOf[StructType]) + schema = { + val fullParquetSchema = if (dataSchemaIncludesPartitionKeys) { + metadataCache.parquetSchema + } else { + StructType(metadataCache.parquetSchema.fields ++ partitionColumns.fields) + } - maybeMetastoreSchema - .map(ParquetRelation2.mergeMetastoreParquetSchema(_, fullParquetSchema)) - .getOrElse(fullParquetSchema) - } + maybeMetastoreSchema + .map(ParquetRelation2.mergeMetastoreParquetSchema(_, fullParquetSchema)) + .getOrElse(fullParquetSchema) + } + } - private def readSchema(): StructType = { - // Sees which file(s) we need to touch in order to figure out the schema. - val filesToTouch = + private def readSchema(): StructType = { + // Sees which file(s) we need to touch in order to figure out the schema. + val filesToTouch = // Always tries the summary files first if users don't require a merged schema. In this case, // "_common_metadata" is more preferable than "_metadata" because it doesn't contain row // groups information, and could be much smaller for large Parquet files with lots of row @@ -212,19 +251,33 @@ case class ParquetRelation2 // Here we tend to be pessimistic and take the second case into account. 
Basically this means // we can't trust the summary files if users require a merged schema, and must touch all part- // files to do the merge. - if (shouldMergeSchemas) { - dataStatuses.toSeq - } else { - commonMetadataStatuses.headOption - .orElse(metadataStatuses.headOption) - // Summary file(s) not found, falls back to the first part-file. - .orElse(dataStatuses.headOption) - .toSeq - } + if (shouldMergeSchemas) { + dataStatuses.toSeq + } else { + commonMetadataStatuses.headOption + .orElse(metadataStatuses.headOption) + // Summary file(s) not found, falls back to the first part-file. + .orElse(dataStatuses.headOption) + .toSeq + } - ParquetRelation2.readSchema(filesToTouch.map(footers.apply), sqlContext) + ParquetRelation2.readSchema(filesToTouch.map(footers.apply), sqlContext) + } } + @transient private val metadataCache = new MetadataCache + metadataCache.refresh() + + private def partitionColumns = metadataCache.partitionSpec.partitionColumns + + private def partitions = metadataCache.partitionSpec.partitions + + private def isPartitioned = partitionColumns.nonEmpty + + private def dataSchemaIncludesPartitionKeys = metadataCache.dataSchemaIncludesPartitionKeys + + override def schema = metadataCache.schema + private def isSummaryFile(file: Path): Boolean = { file.getName == ParquetFileWriter.PARQUET_COMMON_METADATA_FILE || file.getName == ParquetFileWriter.PARQUET_METADATA_FILE @@ -233,7 +286,7 @@ case class ParquetRelation2 // TODO Should calculate per scan size // It's common that a query only scans a fraction of a large Parquet file. Returning size of the // whole Parquet file disables some optimizations in this case (e.g. broadcast join). - override val sizeInBytes = dataStatuses.map(_.getLen).sum + override val sizeInBytes = metadataCache.dataStatuses.map(_.getLen).sum // This is mostly a hack so that we can use the existing parquet filter code. override def buildScan(output: Seq[Attribute], predicates: Seq[Expression]): RDD[Row] = { @@ -244,10 +297,10 @@ case class ParquetRelation2 val selectedPartitions = prunePartitions(predicates, partitions) val selectedFiles = if (isPartitioned) { selectedPartitions.flatMap { p => - dataStatuses.filter(_.getPath.getParent.toString == p.path) + metadataCache.dataStatuses.filter(_.getPath.getParent.toString == p.path) } } else { - dataStatuses.toSeq + metadataCache.dataStatuses.toSeq } // FileInputFormat cannot handle empty lists. @@ -367,6 +420,89 @@ case class ParquetRelation2 partitions } } + + override def insert(data: DataFrame, overwrite: Boolean): Unit = { + // TODO: currently we do not check whether the "schema"s are compatible + // That means if one first creates a table and then INSERTs data with + // an incompatible schema the execution will fail. It would be nice + // to catch this early on, maybe having the planner validate the schema + // before calling execute().
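+    //
+    // A rough, hypothetical sketch of such an early check (illustration only, not part of
+    // this patch) could compare field names and data types while ignoring nullability:
+    //
+    //   val incoming = data.schema.fields.map(f => f.name -> f.dataType)
+    //   val existing = schema.fields.map(f => f.name -> f.dataType)
+    //   require(
+    //     incoming.sameElements(existing),
+    //     s"Cannot insert ${data.schema.simpleString} into a table with schema " +
+    //       s"${schema.simpleString}")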
+ + val job = new Job(sqlContext.sparkContext.hadoopConfiguration) + val writeSupport = if (schema.map(_.dataType).forall(_.isPrimitive)) { + log.debug("Initializing MutableRowWriteSupport") + classOf[MutableRowWriteSupport] + } else { + classOf[RowWriteSupport] + } + + ParquetOutputFormat.setWriteSupportClass(job, writeSupport) + + val conf = ContextUtil.getConfiguration(job) + RowWriteSupport.setSchema(schema.toAttributes, conf) + + val destinationPath = new Path(paths.head) + + if (overwrite) { + try { + destinationPath.getFileSystem(conf).delete(destinationPath, true) + } catch { + case e: IOException => + throw new IOException( + s"Unable to clear output directory ${destinationPath.toString} prior" + + s" to writing to Parquet file:\n${e.toString}") + } + } + + job.setOutputKeyClass(classOf[Void]) + job.setOutputValueClass(classOf[Row]) + FileOutputFormat.setOutputPath(job, destinationPath) + + val wrappedConf = new SerializableWritable(job.getConfiguration) + val jobTrackerId = new SimpleDateFormat("yyyyMMddHHmm").format(new Date()) + val stageId = sqlContext.sparkContext.newRddId() + + val taskIdOffset = if (overwrite) { + 1 + } else { + FileSystemHelper.findMaxTaskId( + FileOutputFormat.getOutputPath(job).toString, job.getConfiguration) + 1 + } + + def writeShard(context: TaskContext, iterator: Iterator[Row]): Unit = { + /* "reduce task" */ + val attemptId = newTaskAttemptID( + jobTrackerId, stageId, isMap = false, context.partitionId(), context.attemptNumber()) + val hadoopContext = newTaskAttemptContext(wrappedConf.value, attemptId) + val format = new AppendingParquetOutputFormat(taskIdOffset) + val committer = format.getOutputCommitter(hadoopContext) + committer.setupTask(hadoopContext) + val writer = format.getRecordWriter(hadoopContext) + try { + while (iterator.hasNext) { + val row = iterator.next() + writer.write(null, row) + } + } finally { + writer.close(hadoopContext) + } + committer.commitTask(hadoopContext) + } + val jobFormat = new AppendingParquetOutputFormat(taskIdOffset) + /* apparently we need a TaskAttemptID to construct an OutputCommitter; + * however we're only going to use this local OutputCommitter for + * setupJob/commitJob, so we just use a dummy "map" task. 
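+     * (For context, the job-level protocol sketched here is: setupJob prepares the output
+     * location, each writeShard task above writes one part-file and commits it through its
+     * task-level committer, and commitJob finalizes the output; a FileOutputCommitter, for
+     * example, typically promotes committed task files out of its temporary directory and
+     * may write a _SUCCESS marker, depending on its configuration.)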
+ */ + val jobAttemptId = newTaskAttemptID(jobTrackerId, stageId, isMap = true, 0, 0) + val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId) + val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext) + + jobCommitter.setupJob(jobTaskContext) + sqlContext.sparkContext.runJob(data.queryExecution.executedPlan.execute(), writeShard _) + jobCommitter.commitJob(jobTaskContext) + + metadataCache.refresh() + } } object ParquetRelation2 { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala index 386ff2452f1a..d23ffb8b7a96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala @@ -18,12 +18,12 @@ package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Row, Strategy} import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, AttributeSet, Expression, NamedExpression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, InsertIntoTable => LogicalInsertIntoTable} -import org.apache.spark.sql.execution +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.{Row, Strategy, execution} /** * A Strategy for planning scans over data sources defined using the sources API. @@ -54,7 +54,7 @@ private[sql] object DataSourceStrategy extends Strategy { case l @ LogicalRelation(t: TableScan) => execution.PhysicalRDD(l.output, t.buildScan()) :: Nil - case i @ LogicalInsertIntoTable( + case i @ logical.InsertIntoTable( l @ LogicalRelation(t: InsertableRelation), partition, query, overwrite) => if (partition.nonEmpty) { sys.error(s"Insert into a partition is not allowed because $l is not partitioned.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala index f1adf60efaf8..9c37e0169ff8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala @@ -241,20 +241,16 @@ object ResolvedDataSource { val relation = userSpecifiedSchema match { case Some(schema: StructType) => { clazz.newInstance match { - case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => - dataSource - .asInstanceOf[org.apache.spark.sql.sources.SchemaRelationProvider] - .createRelation(sqlContext, new CaseInsensitiveMap(options), schema) + case dataSource: SchemaRelationProvider => + dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options), schema) case dataSource: org.apache.spark.sql.sources.RelationProvider => sys.error(s"${clazz.getCanonicalName} does not allow user-specified schemas.") } } case None => { clazz.newInstance match { - case dataSource: org.apache.spark.sql.sources.RelationProvider => - dataSource - .asInstanceOf[org.apache.spark.sql.sources.RelationProvider] - .createRelation(sqlContext, new CaseInsensitiveMap(options)) + case dataSource: RelationProvider => + dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => sys.error(s"A schema needs to be specified when using ${clazz.getCanonicalName}.") } @@ 
-279,10 +275,8 @@ object ResolvedDataSource { } val relation = clazz.newInstance match { - case dataSource: org.apache.spark.sql.sources.CreatableRelationProvider => - dataSource - .asInstanceOf[org.apache.spark.sql.sources.CreatableRelationProvider] - .createRelation(sqlContext, options, data) + case dataSource: CreatableRelationProvider => + dataSource.createRelation(sqlContext, options, data) case _ => sys.error(s"${clazz.getCanonicalName} does not allow create table as select.") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala index c8dff38f17ad..c8ebbbc7d2ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetIOSuite.scala @@ -105,7 +105,8 @@ class ParquetIOSuite extends QueryTest with ParquetTest { sparkContext .parallelize(0 to 1000) .map(i => Tuple1(i / 100.0)) - .select($"_1" cast decimal as "abcd") + // Parquet doesn't allow column names with spaces, have to add an alias here + .select($"_1" cast decimal as "dec") for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17))) { withTempPath { dir => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index b392b3180151..48c7598343e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -36,10 +36,10 @@ class ParquetQuerySuite extends QueryTest with ParquetTest { } // TODO Re-enable this after data source insertion API is merged - ignore(s"$prefix: appending") { + test(s"$prefix: appending") { val data = (0 until 10).map(i => (i, i.toString)) withParquetTable(data, "t") { - sql("INSERT INTO t SELECT * FROM t") + sql("INSERT INTO TABLE t SELECT * FROM t") checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) } } From b6946e67c0bbf30534aa6ebfd6b5926400529a09 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Wed, 4 Feb 2015 17:02:16 -0800 Subject: [PATCH 14/14] Fixes MiMA issues, addresses comments --- .../apache/spark/sql/types/dataTypes.scala | 68 ++++++++++++++++++- .../spark/sql/parquet/ParquetTypes.scala | 58 +--------------- .../apache/spark/sql/parquet/newParquet.scala | 28 +++++--- 3 files changed, 89 insertions(+), 65 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala index be362be55b56..91efe320546a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/dataTypes.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.types import java.sql.Timestamp +import scala.collection.mutable.ArrayBuffer import scala.math.Numeric.{FloatAsIfIntegral, DoubleAsIfIntegral} import scala.reflect.ClassTag import scala.reflect.runtime.universe.{TypeTag, runtimeMirror, typeTag} @@ -29,6 +30,7 @@ import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ +import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.ScalaReflectionLock import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} @@ -159,7 +161,6 @@ object DataType { case failure: 
NoSuccess => throw new IllegalArgumentException(s"Unsupported dataType: $asString, $failure") } - } protected[types] def buildFormattedString( @@ -754,6 +755,57 @@ object StructType { def apply(fields: java.util.List[StructField]): StructType = { StructType(fields.toArray.asInstanceOf[Array[StructField]]) } + + private[sql] def merge(left: DataType, right: DataType): DataType = + (left, right) match { + case (ArrayType(leftElementType, leftContainsNull), + ArrayType(rightElementType, rightContainsNull)) => + ArrayType( + merge(leftElementType, rightElementType), + leftContainsNull || rightContainsNull) + + case (MapType(leftKeyType, leftValueType, leftContainsNull), + MapType(rightKeyType, rightValueType, rightContainsNull)) => + MapType( + merge(leftKeyType, rightKeyType), + merge(leftValueType, rightValueType), + leftContainsNull || rightContainsNull) + + case (StructType(leftFields), StructType(rightFields)) => + val newFields = ArrayBuffer.empty[StructField] + + leftFields.foreach { + case leftField @ StructField(leftName, leftType, leftNullable, _) => + rightFields + .find(_.name == leftName) + .map { case rightField @ StructField(_, rightType, rightNullable, _) => + leftField.copy( + dataType = merge(leftType, rightType), + nullable = leftNullable || rightNullable) + } + .orElse(Some(leftField)) + .foreach(newFields += _) + } + + rightFields + .filterNot(f => leftFields.map(_.name).contains(f.name)) + .foreach(newFields += _) + + StructType(newFields) + + case (DecimalType.Fixed(leftPrecision, leftScale), + DecimalType.Fixed(rightPrecision, rightScale)) => + DecimalType(leftPrecision.max(rightPrecision), leftScale.max(rightScale)) + + case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_]) + if leftUdt.userClass == rightUdt.userClass => leftUdt + + case (leftType, rightType) if leftType == rightType => + leftType + + case _ => + throw new SparkException(s"Failed to merge incompatible data types $left and $right") + } } @@ -890,6 +942,20 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru val fieldTypes = fields.map(field => s"${field.name}:${field.dataType.simpleString}") s"struct<${fieldTypes.mkString(",")}>" } + + /** + * Merges with another schema (`StructType`). For a struct field A from `this` and a struct field + * B from `that`, + * + * 1. If A and B have the same name and data type, they are merged to a field C with the same name + * and data type. C is nullable if and only if either A or B is nullable. + * 2. If A doesn't exist in `that`, it's included in the result schema. + * 3. If B doesn't exist in `this`, it's also included in the result schema. + * 4. Otherwise, `this` and `that` are considered as conflicting schemas and an exception would be + * thrown. 
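+   *
+   * For example (hypothetical schemas, for illustration only):
+   *
+   * {{{
+   *   val left = StructType(Seq(
+   *     StructField("a", IntegerType, nullable = false),
+   *     StructField("b", StringType, nullable = true)))
+   *
+   *   val right = StructType(Seq(
+   *     StructField("a", IntegerType, nullable = true),
+   *     StructField("c", DoubleType, nullable = true)))
+   *
+   *   left.merge(right)
+   *   // => StructType(Seq(
+   *   //      StructField("a", IntegerType, nullable = true),  // nullable since either side is
+   *   //      StructField("b", StringType, nullable = true),
+   *   //      StructField("c", DoubleType, nullable = true)))
+   * }}}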
+ */ + private[sql] def merge(that: StructType): StructType = + StructType.merge(this, that).asInstanceOf[StructType] } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala index 35bbac8cf903..5209581fa835 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala @@ -284,7 +284,7 @@ private[parquet] object ParquetTypesConverter extends Logging { ctype: DataType, name: String, nullable: Boolean = true, - inArray: Boolean = false, + inArray: Boolean = false, toThriftSchemaNames: Boolean = false): ParquetType = { val repetition = if (inArray) { @@ -339,7 +339,7 @@ private[parquet] object ParquetTypesConverter extends Logging { } case StructType(structFields) => { val fields = structFields.map { - field => fromDataType(field.dataType, field.name, field.nullable, + field => fromDataType(field.dataType, field.name, field.nullable, inArray = false, toThriftSchemaNames) } new ParquetGroupType(repetition, name, fields.toSeq) @@ -522,58 +522,4 @@ private[parquet] object ParquetTypesConverter extends Logging { attributes } } - - def mergeCatalystSchemas(left: StructType, right: StructType): StructType = - mergeCatalystDataTypes(left, right).asInstanceOf[StructType] - - def mergeCatalystDataTypes(left: DataType, right: DataType): DataType = - (left, right) match { - case (ArrayType(leftElementType, leftContainsNull), - ArrayType(rightElementType, rightContainsNull)) => - ArrayType( - mergeCatalystDataTypes(leftElementType, rightElementType), - leftContainsNull || rightContainsNull) - - case (MapType(leftKeyType, leftValueType, leftContainsNull), - MapType(rightKeyType, rightValueType, rightContainsNull)) => - MapType( - mergeCatalystDataTypes(leftKeyType, rightKeyType), - mergeCatalystDataTypes(leftValueType, rightValueType), - leftContainsNull || rightContainsNull) - - case (StructType(leftFields), StructType(rightFields)) => - val newFields = ArrayBuffer.empty[StructField] - - leftFields.foreach { - case leftField @ StructField(leftName, leftType, leftNullable, _) => - rightFields - .find(_.name == leftName) - .map { case rightField @ StructField(_, rightType, rightNullable, _) => - leftField.copy( - dataType = mergeCatalystDataTypes(leftType, rightType), - nullable = leftNullable || rightNullable) - } - .orElse(Some(leftField)) - .foreach(newFields += _) - } - - rightFields - .filterNot(f => leftFields.map(_.name).contains(f.name)) - .foreach(newFields += _) - - StructType(newFields) - - case (DecimalType.Fixed(leftPrecision, leftScale), - DecimalType.Fixed(rightPrecision, rightScale)) => - DecimalType(leftPrecision.max(rightPrecision), leftScale.max(rightScale)) - - case (leftUdt: UserDefinedType[_], rightUdt: UserDefinedType[_]) - if leftUdt.userClass == rightUdt.userClass => leftUdt - - case (leftType, rightType) if leftType == rightType => - leftType - - case _ => - throw new SparkException(s"Failed to merge incompatible data types $left and $right") - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 4abe754383a9..49d46334b652 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -45,6 +45,7 @@ import org.apache.spark.sql.catalyst.expressions._ import 
org.apache.spark.sql.parquet.ParquetTypesConverter._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{IntegerType, StructField, StructType, _} +import org.apache.spark.sql.types.StructType._ import org.apache.spark.sql.{DataFrame, Row, SQLConf, SQLContext} import org.apache.spark.{Partition => SparkPartition, TaskContext, SerializableWritable, Logging, SparkException} @@ -173,7 +174,7 @@ case class ParquetRelation2 val statuses = paths.distinct.map(p => fs.getFileStatus(fs.makeQualified(new Path(p)))) // Support either reading a collection of raw Parquet part-files, or a collection of folders // containing Parquet files (e.g. partitioned Parquet table). - assert(statuses.forall(_.isFile) || statuses.forall(_.isDir)) + assert(statuses.forall(!_.isDir) || statuses.forall(_.isDir)) statuses.toArray } @@ -252,11 +253,18 @@ case class ParquetRelation2 // we can't trust the summary files if users require a merged schema, and must touch all part- // files to do the merge. if (shouldMergeSchemas) { - dataStatuses.toSeq + // Also includes summary files, because there might be empty partition directories. + (metadataStatuses ++ commonMetadataStatuses ++ dataStatuses).toSeq } else { + // Tries any "_common_metadata" first. Parquet files written by old versions of Parquet + // don't have this. commonMetadataStatuses.headOption + // Falls back to "_metadata" .orElse(metadataStatuses.headOption) - // Summary file(s) not found, falls back to the first part-file. + // Summary file(s) not found, the Parquet file is either corrupted, or different part- + // files contain conflicting user defined metadata (two or more values are associated + // with the same key in different files). In either case, we fall back to the + // first part-file, and just assume all schemas are consistent. .orElse(dataStatuses.headOption) .toSeq } @@ -507,14 +515,17 @@ case class ParquetRelation2 object ParquetRelation2 { // Whether we should merge schemas collected from all Parquet part-files.
- val MERGE_SCHEMA = "parquet.mergeSchema" + val MERGE_SCHEMA = "mergeSchema" // Hive Metastore schema, passed in when the Parquet relation is converted from Metastore - val METASTORE_SCHEMA = "parquet.metastoreSchema" + val METASTORE_SCHEMA = "metastoreSchema" // Default partition name to use when the partition column value is null or empty string val DEFAULT_PARTITION_NAME = "partition.defaultName" + // When true, the Parquet data source caches Parquet metadata for performance + val CACHE_METADATA = "cacheMetadata" + private[parquet] def readSchema(footers: Seq[Footer], sqlContext: SQLContext): StructType = { footers.map { footer => val metadata = footer.getParquetMetadata.getFileMetaData @@ -535,7 +546,7 @@ object ParquetRelation2 { sqlContext.conf.isParquetINT96AsTimestamp)) } }.reduce { (left, right) => - try mergeCatalystSchemas(left, right) catch { case e: Throwable => + try left.merge(right) catch { case e: Throwable => throw new SparkException(s"Failed to merge incompatible schemas $left and $right", e) } } @@ -637,14 +648,15 @@ object ParquetRelation2 { path: Path, defaultPartitionName: String): PartitionValues = { val columns = ArrayBuffer.empty[(String, Literal)] - var finished = path.isRoot + // Old Hadoop versions don't have `Path.isRoot` + var finished = path.getParent == null var chopped = path while (!finished) { val maybeColumn = parsePartitionColumn(chopped.getName, defaultPartitionName) maybeColumn.foreach(columns += _) chopped = chopped.getParent - finished = maybeColumn.isEmpty || chopped.isRoot + finished = maybeColumn.isEmpty || chopped.getParent == null } val (columnNames, values) = columns.reverse.unzip
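+    // For example (hypothetical path, illustration only): given a partition directory such as
+    //   hdfs://host:9000/table/a=42/b=hello
+    // the loop above collects roughly ("b", Literal("hello")) first and then ("a", Literal(42)),
+    // so the reversal restores outer-to-inner order: columnNames == Seq("a", "b").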