[SPARK-25299] Driver lifecycle API #533

Merged: 17 commits, May 7, 2019
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.shuffle;

import java.io.IOException;

public interface ShuffleDataCleaner {

Does this really need to be separate from ShuffleDriverComponents? It just seems to add more levels of indirection to track.

Author:

I guess it doesn't, but I prefer it this way because it emphasizes that the cleaner runs only on the driver.


I mean, can't you just put removeShuffleData() directly in ShuffleDriverComponents, and then get rid of this interface completely? I think it would still be obvious it runs only on the driver.


void removeShuffleData(int shuffleId, boolean blocking) throws IOException;

}
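
A minimal sketch of the merged shape suggested in the thread above, with removeShuffleData() folded directly into ShuffleDriverComponents. This is only an illustration of the reviewer's proposal, not necessarily the shape the PR settles on:

/*
 * Sketch of the suggested alternative: inline the cleaner method and drop the
 * extra interface. It arguably stays obvious that this runs only on the
 * driver, since the whole interface is driver-side.
 */
package org.apache.spark.api.shuffle;

import java.io.IOException;

public interface ShuffleDriverComponents {

  void initializeApplication();

  void cleanupApplication() throws IOException;

  // Formerly ShuffleDataCleaner#removeShuffleData.
  void removeShuffleData(int shuffleId, boolean blocking) throws IOException;
}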
@@ -28,4 +28,5 @@
@Experimental
public interface ShuffleDataIO {
ShuffleExecutorComponents executor();
ShuffleDriverComponents driver();

Let's move driver above executor. I don't have a great reason for this except that it matches how I tend to think about Spark in general: the driver comes before the executors.

}
@@ -0,0 +1,29 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.shuffle;

import java.io.IOException;

public interface ShuffleDriverComponents {

void initializeApplication();

Should you provide some parameter here? At least SparkConf seems like it would be useful.

(You can probably get it from SparkEnv but I always feel dirty calling that class.)


ShuffleDataIO is service-loaded by Spark, so its constructor can take a SparkConf as an argument. However, I've never been certain whether we should rely on that or pass the SparkConf directly as an argument to the initialization methods. The same question applies to the executor components initializing executors. Thoughts?


If that behavior is documented here, then it should be fine. Either one works.
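
For concreteness, a rough sketch of the explicit-argument option discussed above: the SparkConf is threaded through the lifecycle method instead of relying on the service-loaded plugin constructor (or on SparkEnv.get()). This is a hypothetical signature, not the one this PR adopts:

package org.apache.spark.api.shuffle;

import java.io.IOException;

import org.apache.spark.SparkConf;

public interface ShuffleDriverComponents {

  // The driver would call this once at startup (from SparkContext), handing
  // over its configuration explicitly rather than via the plugin constructor.
  void initializeApplication(SparkConf conf);

  void cleanupApplication() throws IOException;

  ShuffleDataCleaner dataCleaner();
}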


void cleanupApplication() throws IOException;

Any reason for this to throw IOException? I think we can go without any throws declaration for now.

Author:

Hmm, I guess I was expecting this to include something that would make a remote call or hit the file system. I can remove it for now.


ShuffleDataCleaner dataCleaner();
}
@@ -18,8 +18,10 @@
package org.apache.spark.shuffle.sort.io;

import org.apache.spark.SparkConf;
import org.apache.spark.api.shuffle.ShuffleDriverComponents;
import org.apache.spark.api.shuffle.ShuffleExecutorComponents;
import org.apache.spark.api.shuffle.ShuffleDataIO;
import org.apache.spark.shuffle.sort.lifecycle.DefaultShuffleDriverComponents;

public class DefaultShuffleDataIO implements ShuffleDataIO {

@@ -33,4 +35,9 @@ public DefaultShuffleDataIO(SparkConf sparkConf) {
public ShuffleExecutorComponents executor() {
return new DefaultShuffleExecutorComponents(sparkConf);
}

@Override
public ShuffleDriverComponents driver() {
return new DefaultShuffleDriverComponents();
}
}
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.sort.lifecycle;

import org.apache.spark.api.shuffle.ShuffleDataCleaner;
import org.apache.spark.storage.BlockManagerMaster;

import java.io.IOException;

public class DefaultShuffleDataCleaner implements ShuffleDataCleaner {

private final BlockManagerMaster blockManagerMaster;

public DefaultShuffleDataCleaner(
BlockManagerMaster blockManagerMaster) {
this.blockManagerMaster = blockManagerMaster;
}

@Override
public void removeShuffleData(int shuffleId, boolean blocking) throws IOException {
blockManagerMaster.removeShuffle(shuffleId, blocking);
}
}
@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.sort.lifecycle;

import org.apache.spark.SparkEnv;
import org.apache.spark.api.shuffle.ShuffleDataCleaner;
import org.apache.spark.api.shuffle.ShuffleDriverComponents;
import org.apache.spark.storage.BlockManagerMaster;

public class DefaultShuffleDriverComponents implements ShuffleDriverComponents {

private BlockManagerMaster blockManagerMaster;

@Override
public void initializeApplication() {
blockManagerMaster = SparkEnv.get().blockManager().master();
}

I guess the idea is that, for a truly external shuffle service, here the driver would register with that service (including any authentication, etc.) and set up heartbeats?

@ifilonenko, Apr 12, 2019:

Yeah, that would be for non-default implementations, like the external implementation I experimented with in my old PR.
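
For illustration, a hypothetical non-default implementation along those lines. Everything here apart from the ShuffleDriverComponents and ShuffleDataCleaner interfaces is invented: RemoteShuffleClient and all calls on it are stand-ins for whatever RPC layer a real external shuffle service would provide:

package com.example.shuffle; // hypothetical, not part of this PR

import java.io.IOException;

import org.apache.spark.api.shuffle.ShuffleDataCleaner;
import org.apache.spark.api.shuffle.ShuffleDriverComponents;

// Invented stand-in for an external shuffle service's RPC client.
interface RemoteShuffleClient extends AutoCloseable {
  void registerApplication(String appId, String authToken);
  void startHeartbeats();
  void removeShuffle(int shuffleId);
  void unregisterApplication();
  @Override
  void close();
}

public class ExternalShuffleDriverComponents implements ShuffleDriverComponents {

  private final RemoteShuffleClient client;

  public ExternalShuffleDriverComponents(RemoteShuffleClient client) {
    this.client = client;
  }

  @Override
  public void initializeApplication() {
    // Register this application with the external service (authentication
    // included) and start heartbeating so the service can expire dead apps.
    client.registerApplication("spark-app-id", "auth-token"); // placeholders
    client.startHeartbeats();
  }

  @Override
  public void cleanupApplication() throws IOException {
    client.unregisterApplication();
    client.close();
  }

  @Override
  public ShuffleDataCleaner dataCleaner() {
    // ShuffleDataCleaner has a single abstract method, so a lambda suffices.
    return (shuffleId, blocking) -> client.removeShuffle(shuffleId);
  }
}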


@Override
public void cleanupApplication() {
// do nothing
}

@Override
public ShuffleDataCleaner dataCleaner() {
checkInitialized();
return new DefaultShuffleDataCleaner(blockManagerMaster);
}

private void checkInitialized() {
if (blockManagerMaster == null) {
throw new IllegalStateException("Driver components must be initialized before using");
}
}
}
8 changes: 5 additions & 3 deletions core/src/main/scala/org/apache/spark/ContextCleaner.scala
@@ -23,6 +23,7 @@ import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue, Scheduled

import scala.collection.JavaConverters._

import org.apache.spark.api.shuffle.ShuffleDataCleaner
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
@@ -58,7 +59,9 @@ private class CleanupTaskWeakReference(
* to be processed when the associated object goes out of scope of the application. Actual
* cleanup is performed in a separate daemon thread.
*/
private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
private[spark] class ContextCleaner(
sc: SparkContext,
shuffleDataCleaner: ShuffleDataCleaner) extends Logging {

/**
* A buffer to ensure that `CleanupTaskWeakReference`s are not garbage collected as long as they
@@ -222,7 +225,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
try {
logDebug("Cleaning shuffle " + shuffleId)
mapOutputTrackerMaster.unregisterShuffle(shuffleId)
blockManagerMaster.removeShuffle(shuffleId, blocking)
shuffleDataCleaner.removeShuffleData(shuffleId, blocking)
listeners.asScala.foreach(_.shuffleCleaned(shuffleId))
logInfo("Cleaned shuffle " + shuffleId)
} catch {
@@ -270,7 +273,6 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
}
}

private def blockManagerMaster = sc.env.blockManager.master
private def broadcastManager = sc.env.broadcastManager
private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
}
12 changes: 11 additions & 1 deletion core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -43,6 +43,7 @@ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFor
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.conda.CondaEnvironment
import org.apache.spark.api.conda.CondaEnvironment.CondaSetupInstructions
import org.apache.spark.api.shuffle.{ShuffleDataIO, ShuffleDriverComponents}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.deploy.{CondaRunner, LocalSparkCluster, SparkHadoopUtil}
import org.apache.spark.input.{FixedLengthBinaryInputFormat, PortableDataStream, StreamInputFormat, WholeTextFileInputFormat}
@@ -215,6 +216,7 @@ class SparkContext(config: SparkConf) extends SafeLogging {
private var _shutdownHookRef: AnyRef = _
private var _statusStore: AppStatusStore = _
private var _heartbeater: Heartbeater = _
private var _shuffleDriverComponents: ShuffleDriverComponents = _

/* ------------------------------------------------------------------------------------- *
| Accessors and public fields. These provide access to the internal state of the |
@@ -558,9 +560,16 @@ class SparkContext(config: SparkConf) extends SafeLogging {
}
_executorAllocationManager.foreach(_.start())

val configuredPluginClasses = conf.get(SHUFFLE_IO_PLUGIN_CLASS)
val maybeIO = Utils.loadExtensions(
classOf[ShuffleDataIO], Seq(configuredPluginClasses), conf)
require(maybeIO.size == 1, s"Failed to load plugins of type $configuredPluginClasses")
_shuffleDriverComponents = maybeIO.head.driver

driver() (i.e., call the method with parentheses: maybeIO.head.driver())

_shuffleDriverComponents.initializeApplication()

_cleaner =
if (_conf.get(CLEANER_REFERENCE_TRACKING)) {
Some(new ContextCleaner(this))
Some(new ContextCleaner(this, _shuffleDriverComponents.dataCleaner()))
} else {
None
}
@@ -1950,6 +1959,7 @@ class SparkContext(config: SparkConf) extends SafeLogging {
}
_heartbeater = null
}
_shuffleDriverComponents.cleanupApplication()
if (env != null && _heartbeatReceiver != null) {
Utils.tryLogNonFatalError {
env.rpcEnv.stop(_heartbeatReceiver)
@@ -22,6 +22,7 @@ import scala.collection.mutable.ArrayBuffer
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.shuffle.sort.lifecycle.DefaultShuffleDataCleaner
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2}


@@ -210,7 +211,8 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext {
/**
* A special [[ContextCleaner]] that saves the IDs of the accumulators registered for cleanup.
*/
private class SaveAccumContextCleaner(sc: SparkContext) extends ContextCleaner(sc) {
private class SaveAccumContextCleaner(sc: SparkContext) extends
ContextCleaner(sc, null) {

nit: indent more

private val accumsRegistered = new ArrayBuffer[Long]

override def registerAccumulatorForCleanup(a: AccumulatorV2[_, _]): Unit = {