
package org.apache.spark.sql.sources

-import scala.util.Try
+import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.annotation.{DeveloperApi, Experimental}
-import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions._
@@ -368,18 +367,55 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio

  private var _partitionSpec: PartitionSpec = _

+  private class FileStatusCache {
+    var leafFiles = mutable.Map.empty[Path, FileStatus]
+
+    var leafDirs = mutable.Map.empty[Path, FileStatus]
+
+    def refresh() = {
+      def listLeafFilesAndDirs(fs: FileSystem, status: FileStatus): Set[FileStatus] = {
+        val (dirs, files) = fs.listStatus(status.getPath).partition(_.isDir)
+        val leafDirs = if (dirs.isEmpty) Set(status) else Set.empty[FileStatus]
+        files.toSet ++ leafDirs ++ dirs.flatMap(dir => listLeafFilesAndDirs(fs, dir))
+      }
+
+      leafDirs.clear()
+      leafFiles.clear()
+
+      val statuses = paths.flatMap { path =>
+        val hdfsPath = new Path(path)
+        val fs = hdfsPath.getFileSystem(hadoopConf)
+        val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
+        listLeafFilesAndDirs(fs, fs.getFileStatus(qualified)).filterNot { status =>
+          val name = status.getPath.getName
+          !status.isDir && (name.startsWith("_") || name.startsWith("."))
+        }
+      }
+
+      val (dirs, files) = statuses.partition(_.isDir)
+      leafDirs ++= dirs.map(d => d.getPath -> d).toMap
+      leafFiles ++= files.map(f => f.getPath -> f).toMap
+    }
+  }
+
+  private lazy val fileStatusCache = {
+    val cache = new FileStatusCache
+    cache.refresh()
+    cache
+  }
+
  final private[sql] def partitionSpec: PartitionSpec = {
    if (_partitionSpec == null) {
      _partitionSpec = maybePartitionSpec
        .map(spec => spec.copy(partitionColumns = spec.partitionColumns.asNullable))
        .orElse(userDefinedPartitionColumns.map(PartitionSpec(_, Array.empty[Partition])))
        .getOrElse {
-        if (sqlContext.conf.partitionDiscoveryEnabled()) {
-          discoverPartitions()
-        } else {
-          PartitionSpec(StructType(Nil), Array.empty[Partition])
+          if (sqlContext.conf.partitionDiscoveryEnabled()) {
+            discoverPartitions()
+          } else {
+            PartitionSpec(StructType(Nil), Array.empty[Partition])
+          }
        }
-      }
    }
    _partitionSpec
  }
@@ -409,20 +445,14 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
  def userDefinedPartitionColumns: Option[StructType] = None

  private[sql] def refresh(): Unit = {
+    fileStatusCache.refresh()
    if (sqlContext.conf.partitionDiscoveryEnabled()) {
      _partitionSpec = discoverPartitions()
    }
  }

  private def discoverPartitions(): PartitionSpec = {
-    val basePaths = paths.map(new Path(_))
-    val leafDirs = basePaths.flatMap { path =>
-      val fs = path.getFileSystem(hadoopConf)
-      Try(fs.getFileStatus(path.makeQualified(fs.getUri, fs.getWorkingDirectory)))
-        .filter(_.isDir)
-        .map(SparkHadoopUtil.get.listLeafDirStatuses(fs, _))
-        .getOrElse(Seq.empty[FileStatus])
-    }.map(_.getPath)
+    val leafDirs = fileStatusCache.leafDirs.keys.toSeq

    if (leafDirs.nonEmpty) {
      PartitioningUtils.parsePartitions(leafDirs, PartitioningUtils.DEFAULT_PARTITION_NAME)
@@ -444,6 +474,16 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
    })
  }

+  private[sources] final def buildScan(
+      requiredColumns: Array[String],
+      filters: Array[Filter],
+      inputPaths: Array[String]): RDD[Row] = {
+    val inputStatuses = inputPaths.flatMap { path =>
+      fileStatusCache.leafFiles.values.filter(_.getPath.getParent == new Path(path))
+    }
+    buildScan(requiredColumns, filters, inputStatuses)
+  }
+
  /**
   * Specifies schema of actual data files. For partitioned relations, if one or more partitioned
   * columns are contained in the data files, they should also appear in `dataSchema`.
@@ -457,13 +497,13 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   * this relation. For partitioned relations, this method is called for each selected partition,
   * and builds an `RDD[Row]` containing all rows within that single partition.
   *
-   * @param inputPaths For a non-partitioned relation, it contains paths of all data files in the
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   *
   * @since 1.4.0
   */
-  def buildScan(inputPaths: Array[String]): RDD[Row] = {
+  def buildScan(inputFiles: Array[FileStatus]): RDD[Row] = {
    throw new UnsupportedOperationException(
      "At least one buildScan() method should be overridden to read the relation.")
  }
@@ -474,13 +514,13 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   * and builds an `RDD[Row]` containing all rows within that single partition.
   *
   * @param requiredColumns Required columns.
-   * @param inputPaths For a non-partitioned relation, it contains paths of all data files in the
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   *
   * @since 1.4.0
   */
-  def buildScan(requiredColumns: Array[String], inputPaths: Array[String]): RDD[Row] = {
+  def buildScan(requiredColumns: Array[String], inputFiles: Array[FileStatus]): RDD[Row] = {
    // Yeah, to workaround serialization...
    val dataSchema = this.dataSchema
    val codegenEnabled = this.codegenEnabled
@@ -490,7 +530,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
      BoundReference(dataSchema.fieldIndex(col), field.dataType, field.nullable)
    }.toSeq

-    buildScan(inputPaths).mapPartitions { rows =>
+    buildScan(inputFiles).mapPartitions { rows =>
      val buildProjection = if (codegenEnabled) {
        GenerateMutableProjection.generate(requiredOutput, dataSchema.toAttributes)
      } else {
@@ -512,7 +552,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
   * of all `filters`. The pushed down filters are currently purely an optimization as they
   * will all be evaluated again. This means it is safe to use them with methods that produce
   * false positives such as filtering partitions based on a bloom filter.
-   * @param inputPaths For a non-partitioned relation, it contains paths of all data files in the
+   * @param inputFiles For a non-partitioned relation, it contains paths of all data files in the
   *        relation. For a partitioned relation, it contains paths of all data files in a single
   *        selected partition.
   *
@@ -521,8 +561,8 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio
  def buildScan(
      requiredColumns: Array[String],
      filters: Array[Filter],
-      inputPaths: Array[String]): RDD[Row] = {
-    buildScan(requiredColumns, inputPaths)
+      inputFiles: Array[FileStatus]): RDD[Row] = {
+    buildScan(requiredColumns, inputFiles)
  }

  /**
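Illustration (not part of the diff above): after this change, a concrete data source overrides the FileStatus-based buildScan variants rather than the old Array[String] ones, and receives the leaf data files already listed and filtered by FileStatusCache. The sketch below shows roughly what such an override could look like. The class name SimpleTextRelation, the single "value" column, and the line-per-row format are invented for the example; it assumes HadoopFsRelation's public no-argument constructor, and the write-side members of the contract are omitted.

import org.apache.hadoop.fs.FileStatus
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.HadoopFsRelation
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical relation that reads plain text files, one string column per line.
// Sketch only: write-support members of HadoopFsRelation are omitted.
class SimpleTextRelation(
    override val paths: Array[String],
    @transient val sqlContext: SQLContext)
  extends HadoopFsRelation {

  override def dataSchema: StructType = StructType(StructField("value", StringType) :: Nil)

  // Called with the cached leaf data files of the selected partitions; metadata
  // and hidden files (names starting with "_" or ".") are already filtered out.
  override def buildScan(inputFiles: Array[FileStatus]): RDD[Row] = {
    val sc = sqlContext.sparkContext
    if (inputFiles.isEmpty) {
      sc.emptyRDD[Row]
    } else {
      sc.union(inputFiles.map(f => sc.textFile(f.getPath.toString).map(Row(_))).toSeq)
    }
  }
}

Handing implementations FileStatus values rather than plain path strings lets them reuse the listing cached by FileStatusCache (paths, sizes) instead of issuing another round of FileSystem calls on every scan.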