[SPARK-19916][SQL] simplify bad file handling #17253
Changes from all commits
@@ -17,7 +17,7 @@
 package org.apache.spark.sql.execution.datasources

-import java.io.IOException
+import java.io.{FileNotFoundException, IOException}

 import scala.collection.mutable

@@ -44,7 +44,7 @@ case class PartitionedFile(
     filePath: String,
     start: Long,
     length: Long,
-    locations: Array[String] = Array.empty) {
+    @transient locations: Array[String] = Array.empty) {
Member:
do we need to mark it as `@transient`?

Contributor (Author):
this is not for …
   override def toString: String = {
     s"path: $filePath, range: $start-${start + length}, partition values: $partitionValues"
   }
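To illustrate the `@transient` question in the thread above: a minimal, self-contained sketch (the `Block` type is a hypothetical stand-in, not Spark code) of what the annotation does under Java serialization. The field is skipped on the wire and comes back as `null`, which fits a field like `locations`, whose preferred-location hints are consumed on the driver during scheduling rather than by the task that reads the file.

```scala
import java.io._

// Hypothetical stand-in for PartitionedFile, only to demonstrate @transient.
case class Block(path: String, @transient hosts: Array[String])

object TransientDemo {
  def main(args: Array[String]): Unit = {
    val bytes = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(bytes)
    oos.writeObject(Block("/data/part-0", Array("host1")))
    oos.close()

    val copy = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
      .readObject().asInstanceOf[Block]

    println(copy.path)          // /data/part-0  -- serialized normally
    println(copy.hosts == null) // true          -- @transient field was skipped
  }
}
```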
@@ -121,6 +121,20 @@ class FileScanRDD(
       nextElement
     }

+    private def readCurrentFile(): Iterator[InternalRow] = {
+      try {
+        readFunction(currentFile)
+      } catch {
+        case e: FileNotFoundException =>
+          throw new FileNotFoundException(
+            e.getMessage + "\n" +
+              "It is possible the underlying files have been updated. " +
+              "You can explicitly invalidate the cache in Spark by " +
+              "running 'REFRESH TABLE tableName' command in SQL or " +
+              "by recreating the Dataset/DataFrame involved.")
+      }
+    }
+
     /** Advances to the next file. Returns true if a new non-empty iterator is available. */
     private def nextIterator(): Boolean = {
       updateBytesReadWithFileSize()
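The error message added here tells users how to recover. For reference, a sketch of what that looks like on the user side (table name and path are illustrative, assuming a local SparkSession):

```scala
import org.apache.spark.sql.SparkSession

object RefreshExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("refresh").getOrCreate()

    // After the files backing a table are rewritten, clear the cached file listing:
    spark.sql("REFRESH TABLE my_table")    // the SQL form named in the message
    spark.catalog.refreshTable("my_table") // equivalent Catalog API

    // Or recreate the DataFrame, which re-lists the underlying files:
    val df = spark.read.parquet("/path/to/data")

    spark.stop()
  }
}
```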
@@ -130,54 +144,36 @@
       // Sets InputFileBlockHolder for the file block's information
       InputFileBlockHolder.set(currentFile.filePath, currentFile.start, currentFile.length)

-      try {
-        if (ignoreCorruptFiles) {
-          currentIterator = new NextIterator[Object] {
-            private val internalIter = {
-              try {
-                // The readFunction may read files before consuming the iterator.
-                // E.g., vectorized Parquet reader.
-                readFunction(currentFile)
-              } catch {
-                case e @(_: RuntimeException | _: IOException) =>
-                  logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e)
-                  Iterator.empty
-              }
-            }
-
-            override def getNext(): AnyRef = {
-              try {
-                if (internalIter.hasNext) {
-                  internalIter.next()
-                } else {
-                  finished = true
-                  null
-                }
-              } catch {
-                case e: IOException =>
-                  logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e)
-                  finished = true
-                  null
-              }
-            }
-
-            override def close(): Unit = {}
-          }
-        } else {
-          currentIterator = readFunction(currentFile)
-        }
-      } catch {
-        case e: IOException if ignoreCorruptFiles =>
-          logWarning(s"Skipped the rest content in the corrupted file: $currentFile", e)
-          currentIterator = Iterator.empty
-        case e: java.io.FileNotFoundException =>
-          throw new java.io.FileNotFoundException(
-            e.getMessage + "\n" +
-            "It is possible the underlying files have been updated. " +
-            "You can explicitly invalidate the cache in Spark by " +
-            "running 'REFRESH TABLE tableName' command in SQL or " +
-            "by recreating the Dataset/DataFrame involved."
-          )
-      }
+      if (ignoreCorruptFiles) {
+        currentIterator = new NextIterator[Object] {
+          // The readFunction may read some bytes before consuming the iterator, e.g.,
+          // vectorized Parquet reader. Here we use lazy val to delay the creation of
+          // iterator so that we will throw exception in `getNext`.
+          private lazy val internalIter = readCurrentFile()
+
+          override def getNext(): AnyRef = {
+            try {
+              if (internalIter.hasNext) {
+                internalIter.next()
+              } else {
+                finished = true
+                null
+              }
+            } catch {
+              // Throw FileNotFoundException even `ignoreCorruptFiles` is true
+              case e: FileNotFoundException => throw e
+              case e @ (_: RuntimeException | _: IOException) =>
+                logWarning(
+                  s"Skipped the rest of the content in the corrupted file: $currentFile", e)
+                finished = true
+                null
+            }
+          }
+
+          override def close(): Unit = {}
+        }
+      } else {
+        currentIterator = readCurrentFile()
+      }

       hasNext
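The diff's new comment explains the trick: `readFunction` may touch the file as soon as it is called (e.g. the vectorized Parquet reader), so making `internalIter` a `lazy val` defers that work until `getNext`, where one catch block decides whether to skip the file or rethrow. A minimal, self-contained sketch of the mechanism (`openFile` is hypothetical, not Spark code):

```scala
object LazyIteratorDemo {
  // Stand-in for readCurrentFile(): opening the file fails immediately.
  def openFile(): Iterator[Int] = throw new java.io.IOException("corrupt file")

  def main(args: Array[String]): Unit = {
    lazy val internalIter = openFile() // nothing happens yet

    try {
      // First access initializes the lazy val, so the failure surfaces here,
      // inside the guard -- mirroring how getNext() can now decide to skip
      // the file or rethrow a FileNotFoundException.
      internalIter.hasNext
    } catch {
      case e: java.io.IOException => println(s"skipped corrupt file: ${e.getMessage}")
    }
  }
}
```

The skip-on-corruption branch is gated by the existing `spark.sql.files.ignoreCorruptFiles` configuration; only `FileNotFoundException` is rethrown unconditionally, so stale file listings fail loudly instead of being silently dropped.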
No more TODO here?

Actually we don't need to implement this method in all sub-classes. Some FileFormats may override buildReaderWithPartitionValues directly (Parquet), and some FileFormats may not be used in the read path (HiveFileFormat).

@cloud-fan I am wondering why the buildReader method is now marked as protected? Maybe you can comment here: https://issues.apache.org/jira/browse/SPARK-27751
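To make the relationship in the exchange above concrete, a heavily simplified sketch; the real FileFormat API takes schemas, filters, options, and a Hadoop Configuration, and the stub types below are stand-ins, not Spark's definitions:

```scala
// Stand-ins for Spark's internal types, for illustration only.
trait InternalRow
final case class PartitionedFile(filePath: String)

trait FileFormat {
  // protected: subclasses are not required to implement it. Write-only
  // formats (the HiveFileFormat case above) never hit this path, and the
  // default simply fails if it is ever reached.
  protected def buildReader(file: PartitionedFile): Iterator[InternalRow] =
    throw new UnsupportedOperationException(s"buildReader is not supported for $this")

  // Public entry point used by the scan; formats like Parquet override this
  // directly and never implement buildReader at all.
  def buildReaderWithPartitionValues(file: PartitionedFile): Iterator[InternalRow] =
    buildReader(file)
}
```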