
Commit 4d75022

Author: Davies Liu (committed)
Merge branch 'master' of github.com:apache/spark into gen_join
Conflicts:
  sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala
  sql/core/src/test/scala/org/apache/spark/sql/execution/BenchmarkWholeStageCodegen.scala
2 parents: c1c0588 + be5dd88; commit 4d75022

File tree

180 files changed: 4976 additions, 1803 deletions


.rat-excludes

Lines changed: 6 additions & 6 deletions
@@ -73,12 +73,12 @@ logs
 .*dependency-reduced-pom.xml
 known_translations
 json_expectation
-local-1422981759269/*
-local-1422981780767/*
-local-1425081759269/*
-local-1426533911241/*
-local-1426633911242/*
-local-1430917381534/*
+local-1422981759269
+local-1422981780767
+local-1425081759269
+local-1426533911241
+local-1426633911242
+local-1430917381534
 local-1430917381535_1
 local-1430917381535_2
 DESCRIPTION

assembly/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -20,13 +20,13 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.spark</groupId>
-    <artifactId>spark-parent_2.10</artifactId>
+    <artifactId>spark-parent_2.11</artifactId>
     <version>2.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

   <groupId>org.apache.spark</groupId>
-  <artifactId>spark-assembly_2.10</artifactId>
+  <artifactId>spark-assembly_2.11</artifactId>
   <name>Spark Project Assembly</name>
   <url>http://spark.apache.org/</url>
   <packaging>pom</packaging>

common/sketch/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -21,13 +21,13 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.spark</groupId>
-    <artifactId>spark-parent_2.10</artifactId>
+    <artifactId>spark-parent_2.11</artifactId>
     <version>2.0.0-SNAPSHOT</version>
     <relativePath>../../pom.xml</relativePath>
   </parent>

   <groupId>org.apache.spark</groupId>
-  <artifactId>spark-sketch_2.10</artifactId>
+  <artifactId>spark-sketch_2.11</artifactId>
   <packaging>jar</packaging>
   <name>Spark Project Sketch</name>
   <url>http://spark.apache.org/</url>

core/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -20,13 +20,13 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.spark</groupId>
-    <artifactId>spark-parent_2.10</artifactId>
+    <artifactId>spark-parent_2.11</artifactId>
    <version>2.0.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

   <groupId>org.apache.spark</groupId>
-  <artifactId>spark-core_2.10</artifactId>
+  <artifactId>spark-core_2.11</artifactId>
   <properties>
     <sbt.project.name>core</sbt.project.name>
   </properties>
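The _2.10 to _2.11 rename in these poms changes the published artifact coordinates, so downstream builds must pull the Scala 2.11 variants. A minimal sketch of the consumer side, assuming an sbt build against the snapshot version shown above (the project name and Scala patch version are illustrative, not from this commit):

    // build.sbt (illustrative)
    name := "spark-downstream-example"       // hypothetical project name
    scalaVersion := "2.11.7"                 // must match the _2.11 artifact suffix
    libraryDependencies += "org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT"  // resolves to spark-core_2.11

The %% operator appends the Scala binary version, so the dependency resolves to spark-core_2.11 automatically once scalaVersion is set to a 2.11.x release.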

core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala

Lines changed: 8 additions & 116 deletions
@@ -248,9 +248,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
     val logInfos: Seq[FileStatus] = statusList
       .filter { entry =>
         try {
-          getModificationTime(entry).map { time =>
-            time >= lastScanTime
-          }.getOrElse(false)
+          !entry.isDirectory() && (entry.getModificationTime() >= lastScanTime)
         } catch {
           case e: AccessControlException =>
             // Do not use "logInfo" since these messages can get pretty noisy if printed on
@@ -261,9 +259,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
       }
       .flatMap { entry => Some(entry) }
       .sortWith { case (entry1, entry2) =>
-        val mod1 = getModificationTime(entry1).getOrElse(-1L)
-        val mod2 = getModificationTime(entry2).getOrElse(-1L)
-        mod1 >= mod2
+        entry1.getModificationTime() >= entry2.getModificationTime()
       }

     logInfos.grouped(20)
@@ -341,19 +337,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
         attempt.attemptId.isEmpty || attemptId.isEmpty || attempt.attemptId.get == attemptId.get
       }.foreach { attempt =>
         val logPath = new Path(logDir, attempt.logPath)
-        // If this is a legacy directory, then add the directory to the zipStream and add
-        // each file to that directory.
-        if (isLegacyLogDirectory(fs.getFileStatus(logPath))) {
-          val files = fs.listStatus(logPath)
-          zipStream.putNextEntry(new ZipEntry(attempt.logPath + "/"))
-          zipStream.closeEntry()
-          files.foreach { file =>
-            val path = file.getPath
-            zipFileToStream(path, attempt.logPath + Path.SEPARATOR + path.getName, zipStream)
-          }
-        } else {
-          zipFileToStream(new Path(logDir, attempt.logPath), attempt.logPath, zipStream)
-        }
+        zipFileToStream(new Path(logDir, attempt.logPath), attempt.logPath, zipStream)
       }
     } finally {
       zipStream.close()
@@ -527,30 +511,24 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
       bus: ReplayListenerBus): Option[FsApplicationAttemptInfo] = {
     val logPath = eventLog.getPath()
     logInfo(s"Replaying log path: $logPath")
-    val logInput =
-      if (isLegacyLogDirectory(eventLog)) {
-        openLegacyEventLog(logPath)
-      } else {
-        EventLoggingListener.openEventLog(logPath, fs)
-      }
+    val logInput = EventLoggingListener.openEventLog(logPath, fs)
     try {
       val appListener = new ApplicationEventListener
       val appCompleted = isApplicationCompleted(eventLog)
       bus.addListener(appListener)
       bus.replay(logInput, logPath.toString, !appCompleted)

       // Without an app ID, new logs will render incorrectly in the listing page, so do not list or
-      // try to show their UI. Some old versions of Spark generate logs without an app ID, so let
-      // logs generated by those versions go through.
-      if (appListener.appId.isDefined || !sparkVersionHasAppId(eventLog)) {
+      // try to show their UI.
+      if (appListener.appId.isDefined) {
         Some(new FsApplicationAttemptInfo(
           logPath.getName(),
           appListener.appName.getOrElse(NOT_STARTED),
           appListener.appId.getOrElse(logPath.getName()),
           appListener.appAttemptId,
           appListener.startTime.getOrElse(-1L),
           appListener.endTime.getOrElse(-1L),
-          getModificationTime(eventLog).get,
+          eventLog.getModificationTime(),
           appListener.sparkUser.getOrElse(NOT_STARTED),
           appCompleted))
       } else {
@@ -561,91 +539,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
     }
   }

-  /**
-   * Loads a legacy log directory. This assumes that the log directory contains a single event
-   * log file (along with other metadata files), which is the case for directories generated by
-   * the code in previous releases.
-   *
-   * @return input stream that holds one JSON record per line.
-   */
-  private[history] def openLegacyEventLog(dir: Path): InputStream = {
-    val children = fs.listStatus(dir)
-    var eventLogPath: Path = null
-    var codecName: Option[String] = None
-
-    children.foreach { child =>
-      child.getPath().getName() match {
-        case name if name.startsWith(LOG_PREFIX) =>
-          eventLogPath = child.getPath()
-        case codec if codec.startsWith(COMPRESSION_CODEC_PREFIX) =>
-          codecName = Some(codec.substring(COMPRESSION_CODEC_PREFIX.length()))
-        case _ =>
-      }
-    }
-
-    if (eventLogPath == null) {
-      throw new IllegalArgumentException(s"$dir is not a Spark application log directory.")
-    }
-
-    val codec = try {
-      codecName.map { c => CompressionCodec.createCodec(conf, c) }
-    } catch {
-      case e: Exception =>
-        throw new IllegalArgumentException(s"Unknown compression codec $codecName.")
-    }
-
-    val in = new BufferedInputStream(fs.open(eventLogPath))
-    codec.map(_.compressedInputStream(in)).getOrElse(in)
-  }
-
-  /**
-   * Return whether the specified event log path contains a old directory-based event log.
-   * Previously, the event log of an application comprises of multiple files in a directory.
-   * As of Spark 1.3, these files are consolidated into a single one that replaces the directory.
-   * See SPARK-2261 for more detail.
-   */
-  private def isLegacyLogDirectory(entry: FileStatus): Boolean = entry.isDirectory
-
-  /**
-   * Returns the modification time of the given event log. If the status points at an empty
-   * directory, `None` is returned, indicating that there isn't an event log at that location.
-   */
-  private def getModificationTime(fsEntry: FileStatus): Option[Long] = {
-    if (isLegacyLogDirectory(fsEntry)) {
-      val statusList = fs.listStatus(fsEntry.getPath)
-      if (!statusList.isEmpty) Some(statusList.map(_.getModificationTime()).max) else None
-    } else {
-      Some(fsEntry.getModificationTime())
-    }
-  }
-
   /**
    * Return true when the application has completed.
    */
   private def isApplicationCompleted(entry: FileStatus): Boolean = {
-    if (isLegacyLogDirectory(entry)) {
-      fs.exists(new Path(entry.getPath(), APPLICATION_COMPLETE))
-    } else {
-      !entry.getPath().getName().endsWith(EventLoggingListener.IN_PROGRESS)
-    }
-  }
-
-  /**
-   * Returns whether the version of Spark that generated logs records app IDs. App IDs were added
-   * in Spark 1.1.
-   */
-  private def sparkVersionHasAppId(entry: FileStatus): Boolean = {
-    if (isLegacyLogDirectory(entry)) {
-      fs.listStatus(entry.getPath())
-        .find { status => status.getPath().getName().startsWith(SPARK_VERSION_PREFIX) }
-        .map { status =>
-          val version = status.getPath().getName().substring(SPARK_VERSION_PREFIX.length())
-          version != "1.0" && version != "1.1"
-        }
-        .getOrElse(true)
-    } else {
-      true
-    }
+    !entry.getPath().getName().endsWith(EventLoggingListener.IN_PROGRESS)
   }

   /**
@@ -670,12 +568,6 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)

 private[history] object FsHistoryProvider {
   val DEFAULT_LOG_DIR = "file:/tmp/spark-events"
-
-  // Constants used to parse Spark 1.0.0 log directories.
-  val LOG_PREFIX = "EVENT_LOG_"
-  val SPARK_VERSION_PREFIX = EventLoggingListener.SPARK_VERSION_KEY + "_"
-  val COMPRESSION_CODEC_PREFIX = EventLoggingListener.COMPRESSION_CODEC_KEY + "_"
-  val APPLICATION_COMPLETE = "APPLICATION_COMPLETE"
 }

 private class FsApplicationAttemptInfo(
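The deletions above drop support for the directory-based event logs written before Spark 1.3; the provider now assumes each application produced a single event log file. A minimal sketch of the producer side under that assumption, using the standard event-log settings (the application name, directory, and job are illustrative, not part of this diff):

    import org.apache.spark.{SparkConf, SparkContext}

    val conf = new SparkConf()
      .setAppName("event-log-example")                 // hypothetical application name
      .setMaster("local[2]")
      .set("spark.eventLog.enabled", "true")           // write a single event log file per application
      .set("spark.eventLog.dir", "/tmp/spark-events")  // should match the history server's spark.history.fs.logDirectory
    val sc = new SparkContext(conf)
    sc.parallelize(1 to 10).count()
    sc.stop()  // stopping the context finalizes the log, dropping the ".inprogress" suffix the provider checks for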

core/src/main/scala/org/apache/spark/deploy/mesos/MesosClusterDispatcher.scala

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ private[mesos] class MesosClusterDispatcher(
   extends Logging {

   private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host)
-  private val recoveryMode = conf.get("spark.mesos.deploy.recoveryMode", "NONE").toUpperCase()
+  private val recoveryMode = conf.get("spark.deploy.recoveryMode", "NONE").toUpperCase()
   logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode)

   private val engineFactory = recoveryMode match {
@@ -98,8 +98,8 @@ private[mesos] object MesosClusterDispatcher extends Logging {
     conf.setMaster(dispatcherArgs.masterUrl)
     conf.setAppName(dispatcherArgs.name)
     dispatcherArgs.zookeeperUrl.foreach { z =>
-      conf.set("spark.mesos.deploy.recoveryMode", "ZOOKEEPER")
-      conf.set("spark.mesos.deploy.zookeeper.url", z)
+      conf.set("spark.deploy.recoveryMode", "ZOOKEEPER")
+      conf.set("spark.deploy.zookeeper.url", z)
     }
     val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf)
     dispatcher.start()
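Only the configuration key names change here: the dispatcher now reads the generic spark.deploy.* recovery settings instead of spark.mesos.deploy.*. A minimal sketch of setting them explicitly, with a hypothetical ZooKeeper quorum (not from this commit):

    import org.apache.spark.SparkConf

    val conf = new SparkConf()
      .set("spark.deploy.recoveryMode", "ZOOKEEPER")            // formerly spark.mesos.deploy.recoveryMode
      .set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181")   // formerly spark.mesos.deploy.zookeeper.url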

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 19 additions & 0 deletions
@@ -1542,6 +1542,15 @@ abstract class RDD[T: ClassTag](

   private[spark] var checkpointData: Option[RDDCheckpointData[T]] = None

+  // Whether to checkpoint all ancestor RDDs that are marked for checkpointing. By default,
+  // we stop as soon as we find the first such RDD, an optimization that allows us to write
+  // less data but is not safe for all workloads. E.g. in streaming we may checkpoint both
+  // an RDD and its parent in every batch, in which case the parent may never be checkpointed
+  // and its lineage never truncated, leading to OOMs in the long run (SPARK-6847).
+  private val checkpointAllMarkedAncestors =
+    Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS))
+      .map(_.toBoolean).getOrElse(false)
+
   /** Returns the first parent RDD */
   protected[spark] def firstParent[U: ClassTag]: RDD[U] = {
     dependencies.head.rdd.asInstanceOf[RDD[U]]
@@ -1585,6 +1594,13 @@ abstract class RDD[T: ClassTag](
     if (!doCheckpointCalled) {
       doCheckpointCalled = true
       if (checkpointData.isDefined) {
+        if (checkpointAllMarkedAncestors) {
+          // TODO We can collect all the RDDs that needs to be checkpointed, and then checkpoint
+          // them in parallel.
+          // Checkpoint parents first because our lineage will be truncated after we
+          // checkpoint ourselves
+          dependencies.foreach(_.rdd.doCheckpoint())
+        }
         checkpointData.get.checkpoint()
       } else {
         dependencies.foreach(_.rdd.doCheckpoint())
@@ -1704,6 +1720,9 @@ abstract class RDD[T: ClassTag](
   */
 object RDD {

+  private[spark] val CHECKPOINT_ALL_MARKED_ANCESTORS =
+    "spark.checkpoint.checkpointAllMarkedAncestors"
+
   // The following implicit functions were in SparkContext before 1.3 and users had to
   // `import SparkContext._` to enable them. Now we move them here to make the compiler find
   // them automatically. However, we still keep the old functions in SparkContext for backward
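The new flag is read from a thread-local SparkContext property when an RDD is constructed, so a job can opt in to checkpointing every marked ancestor instead of only the first one found. A minimal sketch of how a caller might set it; the property name comes from the diff above, while the paths and data are illustrative:

    import org.apache.spark.{SparkConf, SparkContext}

    val sc = new SparkContext(new SparkConf().setAppName("checkpoint-ancestors").setMaster("local[2]"))
    sc.setCheckpointDir("/tmp/checkpoints")  // hypothetical checkpoint directory
    sc.setLocalProperty("spark.checkpoint.checkpointAllMarkedAncestors", "true")  // set before creating the RDDs

    val parent = sc.parallelize(1 to 1000)
    parent.checkpoint()            // parent marked for checkpointing
    val child = parent.map(_ * 2)
    child.checkpoint()             // child marked as well
    child.count()                  // doCheckpoint now walks up the lineage and checkpoints the parent too
    sc.stop()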

core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala

Lines changed: 0 additions & 2 deletions
@@ -232,8 +232,6 @@ private[spark] object EventLoggingListener extends Logging {
   // Suffix applied to the names of files still being written by applications.
   val IN_PROGRESS = ".inprogress"
   val DEFAULT_LOG_DIR = "/tmp/spark-events"
-  val SPARK_VERSION_KEY = "SPARK_VERSION"
-  val COMPRESSION_CODEC_KEY = "COMPRESSION_CODEC"

   private val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort)
core/src/main/scala/org/apache/spark/scheduler/SparkListener.scala

Lines changed: 12 additions & 1 deletion
@@ -270,7 +270,7 @@ class StatsReportListener extends SparkListener with Logging {

   override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) {
     implicit val sc = stageCompleted
-    this.logInfo("Finished stage: " + stageCompleted.stageInfo)
+    this.logInfo(s"Finished stage: ${getStatusDetail(stageCompleted.stageInfo)}")
     showMillisDistribution("task runtime:", (info, _) => Some(info.duration), taskInfoMetrics)

     // Shuffle write
@@ -297,6 +297,17 @@ class StatsReportListener extends SparkListener with Logging {
     taskInfoMetrics.clear()
   }

+  private def getStatusDetail(info: StageInfo): String = {
+    val failureReason = info.failureReason.map("(" + _ + ")").getOrElse("")
+    val timeTaken = info.submissionTime.map(
+      x => info.completionTime.getOrElse(System.currentTimeMillis()) - x
+    ).getOrElse("-")
+
+    s"Stage(${info.stageId}, ${info.attemptId}); Name: '${info.name}'; " +
+      s"Status: ${info.getStatusString}$failureReason; numTasks: ${info.numTasks}; " +
+      s"Took: $timeTaken msec"
+  }
+
 }

 private[spark] object StatsReportListener extends Logging {
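With this change the stage-completion log line carries the stage id, attempt, name, status, failure reason, task count, and elapsed time rather than the raw StageInfo toString. A minimal sketch of wiring the listener up so that line is emitted; the job and the sample output are illustrative, not from this commit:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.scheduler.StatsReportListener

    val sc = new SparkContext(new SparkConf().setAppName("stats-report").setMaster("local[2]"))
    sc.addSparkListener(new StatsReportListener)  // logs per-stage summaries at INFO level
    sc.parallelize(1 to 100).map(_ + 1).count()
    // Expected log shape, per getStatusDetail above (values illustrative):
    //   Finished stage: Stage(0, 0); Name: 'count at Example.scala:7'; Status: succeeded; numTasks: 2; Took: 120 msec
    sc.stop()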

core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala

Lines changed: 1 addition & 1 deletion
@@ -179,7 +179,7 @@ private[spark] class CoarseMesosSchedulerBackend(
       .orElse(Option(System.getenv("SPARK_EXECUTOR_URI")))

     if (uri.isEmpty) {
-      val runScript = new File(executorSparkHome, "./bin/spark-class").getCanonicalPath
+      val runScript = new File(executorSparkHome, "./bin/spark-class").getPath
       command.setValue(
         "%s \"%s\" org.apache.spark.executor.CoarseGrainedExecutorBackend"
           .format(prefixEnv, runScript) +
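The switch from getCanonicalPath to getPath keeps the constructed path string instead of resolving it against the driver's local filesystem. A minimal sketch of the java.io.File behavior involved, using a hypothetical Spark home (not from this commit):

    import java.io.File

    val runScript = new File("/opt/spark", "./bin/spark-class")  // hypothetical executorSparkHome
    println(runScript.getPath)           // "/opt/spark/./bin/spark-class" -- no filesystem access
    println(runScript.getCanonicalPath)  // typically "/opt/spark/bin/spark-class", with symlinks and "." resolved locally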
