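Address comment: set InputFileBlockHolder eagerly in HadoopRDD/NewHadoopRDD and back it with an InheritableThreadLocal.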

viirya · viirya · commit 1563e03796a1 · 2017-01-17T05:32:55.000Z
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -217,6 +217,14 @@ class HadoopRDD[K, V](
       private val inputMetrics = context.taskMetrics().inputMetrics
       private val existingBytesRead = inputMetrics.bytesRead
 
+      // Set file block info eagerly; the holder is inheritable, so threads spawned later see it.
+      split.inputSplit.value match {
+        case fs: FileSplit =>
+          InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
+        case _ =>
+          InputFileBlockHolder.unset()
+      }
+
       // Find a function that will return the FileSystem bytes read by this thread. Do this before
       // creating RecordReader, because RecordReader's constructor might read some bytes
       private val getBytesReadCallback: Option[() => Long] = split.inputSplit.value match {
@@ -255,23 +263,7 @@ class HadoopRDD[K, V](
       private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
       private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()
 
-      private var setInputFileBlockHolder: Boolean = false
-
       override def getNext(): (K, V) = {
-        if (!setInputFileBlockHolder) {
-          // Sets InputFileBlockHolder for the file block's information
-          // We can't set it before consuming this iterator, otherwise some expressions which
-          // use thread local variables will fail when working with Python UDF. That is because
-          // the batch of Python UDF is running in individual thread.
-          split.inputSplit.value match {
-            case fs: FileSplit =>
-              InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
-            case _ =>
-              InputFileBlockHolder.unset()
-          }
-          setInputFileBlockHolder = true
-        }
-
         try {
           finished = !reader.next(key, value)
         } catch {
diff --git a/core/src/main/scala/org/apache/spark/rdd/InputFileBlockHolder.scala b/core/src/main/scala/org/apache/spark/rdd/InputFileBlockHolder.scala
@@ -41,9 +41,10 @@ private[spark] object InputFileBlockHolder {
    * The thread variable for the name of the current file being read. This is used by
    * the InputFileName function in Spark SQL.
    */
-  private[this] val inputBlock: ThreadLocal[FileBlock] = new ThreadLocal[FileBlock] {
-    override protected def initialValue(): FileBlock = new FileBlock
-  }
+  private[this] val inputBlock: InheritableThreadLocal[FileBlock] =
+    new InheritableThreadLocal[FileBlock] {
+      override protected def initialValue(): FileBlock = new FileBlock
+    }
 
   /**
    * Returns the holding file name or empty string if it is unknown.
diff --git a/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala
@@ -139,6 +139,14 @@ class NewHadoopRDD[K, V](
       private val inputMetrics = context.taskMetrics().inputMetrics
       private val existingBytesRead = inputMetrics.bytesRead
 
+      // Set file block info eagerly; the holder is inheritable, so threads spawned later see it.
+      split.serializableHadoopSplit.value match {
+        case fs: FileSplit =>
+          InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
+        case _ =>
+          InputFileBlockHolder.unset()
+      }
+
       // Find a function that will return the FileSystem bytes read by this thread. Do this before
       // creating RecordReader, because RecordReader's constructor might read some bytes
       private val getBytesReadCallback: Option[() => Long] =
@@ -209,23 +217,7 @@ class NewHadoopRDD[K, V](
         !finished
       }
 
-      private var setInputFileBlockHolder: Boolean = false
-
       override def next(): (K, V) = {
-        if (!setInputFileBlockHolder) {
-          // Sets InputFileBlockHolder for the file block's information.
-          // We can't set it before consuming this iterator, otherwise some expressions which
-          // use thread local variables will fail when working with Python UDF. That is because
-          // the batch of Python UDF is running in individual thread.
-          split.serializableHadoopSplit.value match {
-            case fs: FileSplit =>
-              InputFileBlockHolder.set(fs.getPath.toString, fs.getStart, fs.getLength)
-            case _ =>
-              InputFileBlockHolder.unset()
-          }
-          setInputFileBlockHolder = true
-        }
-
         if (!hasNext) {
           throw new java.util.NoSuchElementException("End of stream")
         }