@@ -204,6 +204,6 @@ abstract class PartitioningAwareFileCatalog(
 
   private def isDataPath(path: Path): Boolean = {
     val name = path.getName
-    !(name.startsWith("_") || name.startsWith("."))
+    !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))
   }
 }
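
To make the new predicate concrete, here is a minimal standalone sketch (plain strings instead of Hadoop `Path`; the sample names are invented for illustration):

    // Sketch of isDataPath above, inlined on plain strings.
    def isDataPath(name: String): Boolean =
      !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))

    assert(isDataPath("part-00000"))  // ordinary data file: kept, as before
    assert(isDataPath("_col=0"))      // partition dir for a column named "_col": now kept
    assert(!isDataPath("_SUCCESS"))   // job marker: still filtered out
    assert(!isDataPath("_metadata"))  // Parquet summary file: still filtered out
    assert(!isDataPath(".hidden"))    // hidden file: still filtered out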
@@ -364,7 +364,7 @@ object HadoopFsRelation extends Logging {
     // We filter everything that starts with _ and ., except _common_metadata and _metadata
     // because Parquet needs to find those metadata files from leaf files returned by this method.
     // We should refactor this logic to not mix metadata files with data files.
-    (pathName.startsWith("_") || pathName.startsWith(".")) &&
+    ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
       !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")
   }

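A quick sanity check of the updated filter as a standalone sketch (the enclosing method is not visible in this hunk, so the name `shouldFilterOut` is assumed; true means the path name should be excluded from the listing):

    // Assumed name for the predicate shown above; true means "exclude this path".
    def shouldFilterOut(pathName: String): Boolean =
      ((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) &&
        !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata")

    assert(!shouldFilterOut("_col=0"))           // partition directory: keep
    assert(!shouldFilterOut("_metadata"))        // Parquet metadata: keep, Parquet needs it
    assert(!shouldFilterOut("_common_metadata")) // likewise
    assert(shouldFilterOut("_SUCCESS"))          // job marker: filter out
    assert(shouldFilterOut(".part-0.crc"))       // hidden checksum file: filter out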
@@ -54,7 +54,7 @@ class JsonFileFormat extends TextBasedFileFormat with DataSourceRegister {
       .getOrElse(sparkSession.sessionState.conf.columnNameOfCorruptRecord)
     val jsonFiles = files.filterNot { status =>
       val name = status.getPath.getName
-      name.startsWith("_") || name.startsWith(".")
+      (name.startsWith("_") && !name.contains("=")) || name.startsWith(".")
Review comment, @HyukjinKwon (Member), Aug 12, 2016:

Hm.. @liancheng @dongjoon-hyun Do you mind if I ask a question, please?

If my understanding is correct, `name` here will be a part-... file whether or not the parent directory starts with _, so this would be an unnecessary extra check. The same is happening in ParquetFileFormat.

Do you mind if I open a small follow-up to clean those up?

Review comment, @HyukjinKwon (Member), Aug 12, 2016:

It might otherwise look inconsistent, because OrcFileFormat and CSVFileFormat have similar checks (linked in the original discussion). If it looks nicer to add the condition just in case, I can make ORC and CSV consistent as well.

Reply from a contributor:

Oh yeah, you're right. Here `files` only contains leaf files, so this check is redundant. Please feel free to clean it up. Thanks!

     }.toArray
 
     val jsonSchema = InferSchema.infer(
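The reviewer's point hinges on `getName` returning only the last path segment: a leaf file inside a partition directory never starts with _ even when the directory does. A small sketch (assumes Hadoop's `Path` class on the classpath; the file name is invented):

    import org.apache.hadoop.fs.Path

    val leaf = new Path("/tmp/t/_col=0/part-00000-example.json")
    assert(leaf.getName == "part-00000-example.json") // leaf name never starts with "_"
    assert(leaf.getParent.getName == "_col=0")        // the "_" is on the directory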
@@ -236,7 +236,8 @@ class ParquetFileFormat
     // Lists `FileStatus`es of all leaf nodes (files) under all base directories.
     val leaves = allFiles.filter { f =>
       isSummaryFile(f.getPath) ||
-        !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
+        !((f.getPath.getName.startsWith("_") && !f.getPath.getName.contains("=")) ||
+          f.getPath.getName.startsWith("."))
     }.toArray.sortBy(_.getPath.toString)
 
     FileTypes(
@@ -17,6 +17,7 @@

 package org.apache.spark.sql
 
+import java.io.File
 import java.math.MathContext
 import java.sql.{Date, Timestamp}

@@ -2960,6 +2961,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
     }
   }
 
+  test("SPARK-16975: Column-partition path starting '_' should be handled correctly") {
+    withTempDir { dir =>
+      val parquetDir = new File(dir, "parquet").getCanonicalPath
+      spark.range(10).withColumn("_col", $"id").write.partitionBy("_col").save(parquetDir)
+      spark.read.parquet(parquetDir)
+    }
+  }
+
   test("SPARK-16644: Aggregate should not put aggregate expressions to constraints") {
     withTable("tbl") {
       sql("CREATE TABLE tbl(a INT, b INT) USING parquet")
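
For context, the regression test above writes ten rows partitioned by a column literally named `_col`, which produces a layout along these lines (a sketch; exact part-file names vary by run):

    parquet/_SUCCESS
    parquet/_col=0/part-...snappy.parquet
    parquet/_col=1/part-...snappy.parquet
    ...

Before this change, the `_col=0`-style directories were skipped as hidden or metadata paths, so reading the table back found no data files; the `!name.contains("=")` exception lets them be recognized as partition directories.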