Closed

28 commits
82d6caa  dump (juliuszsompolski, Jun 2, 2023)
34ef9f6  uses cancelWithTag in BroadcastExchangeExec (juliuszsompolski, Jun 2, 2023)
7f0bb59  nicer tag, since I remove description (juliuszsompolski, Jun 2, 2023)
5bebbc2  also use job tags in Spark Connect (juliuszsompolski, Jun 2, 2023)
370631b  fix typo (juliuszsompolski, Jun 5, 2023)
651d594  another typo (juliuszsompolski, Jun 5, 2023)
3733303  retrigger CI (juliuszsompolski, Jun 5, 2023)
1217de1  add to AppStatusTracker (juliuszsompolski, Jun 6, 2023)
c375a97  add Mima exclude (juliuszsompolski, Jun 7, 2023)
aa2f950  Merge branch 'master' into SPARK-43952 (juliuszsompolski, Jun 8, 2023)
35336b0  fix compile (juliuszsompolski, Jun 9, 2023)
89f3047  fix history server canons, fix bug with empty tag (juliuszsompolski, Jun 12, 2023)
4b33ef5  test invalid tags (juliuszsompolski, Jun 12, 2023)
8fe0058  guard against null tag (juliuszsompolski, Jun 13, 2023)
40255a4  execution context (juliuszsompolski, Jun 13, 2023)
cecbfda  fix BroadcastExchangeSuite - it now cancels by tag (juliuszsompolski, Jun 13, 2023)
a87746a  did not pass the correct executionContext... (juliuszsompolski, Jun 14, 2023)
3bfe83c  retrigger CI (juliuszsompolski, Jun 14, 2023)
bd79500  retrigger CI (juliuszsompolski, Jun 15, 2023)
d964895  retrigger CI (juliuszsompolski, Jun 15, 2023)
050bf85  retrigger CI (juliuszsompolski, Jun 15, 2023)
c8d9b46  add also job group for Spark Connect queries (juliuszsompolski, Jun 19, 2023)
607f11f  fix compile (juliuszsompolski, Jun 20, 2023)
7d15f3c  Merge branch 'master' into SPARK-43952 (juliuszsompolski, Jun 20, 2023)
12921d1  clear job group (juliuszsompolski, Jun 20, 2023)
cd8d058  nits (juliuszsompolski, Jun 20, 2023)
98f6034  once again try to retrigger CI (juliuszsompolski, Jun 20, 2023)
d52ff2e  retrigger CI (juliuszsompolski, Jun 20, 2023)
@@ -27,15 +27,16 @@ case class ExecutePlanHolder(
sessionHolder: SessionHolder,
request: proto.ExecutePlanRequest) {

val jobGroupId =
s"User_${sessionHolder.userId}_Session_${sessionHolder.sessionId}_Request_${operationId}"
val jobTag =
"SparkConnect_" +
s"User_${sessionHolder.userId}_Session_${sessionHolder.sessionId}_Request_${operationId}"

def interrupt(): Unit = {
// TODO/WIP: This only interrupts active Spark jobs that are actively running.
// This would then throw the error from ExecutePlan and terminate it.
// But if the query is not running a Spark job, but executing code on Spark driver, this
// would be a noop and the execution will keep running.
sessionHolder.session.sparkContext.cancelJobGroup(jobGroupId)
sessionHolder.session.sparkContext.cancelJobsWithTag(jobTag)
}

}
@@ -63,8 +63,12 @@ class SparkConnectStreamHandler(responseObserver: StreamObserver[ExecutePlanResp
}

val executeHolder = sessionHolder.createExecutePlanHolder(v)
session.sparkContext.addJobTag(executeHolder.jobTag)
session.sparkContext.setInterruptOnCancel(true)
// Also set the tag as the JobGroup for all the jobs in the query.
// TODO: In the long term, it should be encouraged to use job tag only.
session.sparkContext.setJobGroup(
executeHolder.jobGroupId,
executeHolder.jobTag,
s"Spark Connect - ${StringUtils.abbreviate(debugString, 128)}",
interruptOnCancel = true)

@@ -89,6 +93,8 @@ class SparkConnectStreamHandler(responseObserver: StreamObserver[ExecutePlanResp
throw new UnsupportedOperationException(s"${v.getPlan.getOpTypeCase} not supported.")
}
} finally {
session.sparkContext.removeJobTag(executeHolder.jobTag)
session.sparkContext.clearJobGroup()
sessionHolder.removeExecutePlanHolder(executeHolder.operationId)
}
}
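
The handler above brackets query execution with `addJobTag`/`removeJobTag`, so an interrupt issued from another RPC thread (via `ExecutePlanHolder.interrupt()`) cancels exactly this query's jobs. Below is a minimal sketch of the same pattern outside Spark Connect; the helper name `runTagged` and the tag value are illustrative and not part of this PR.

```scala
import org.apache.spark.SparkContext

// Hypothetical helper mirroring the handler's try/finally above: tag the current
// thread, run the query body, and always untag so later work on this thread is
// unaffected even if the body throws.
def runTagged(sc: SparkContext, jobTag: String)(body: => Unit): Unit = {
  sc.addJobTag(jobTag)
  sc.setInterruptOnCancel(true) // cancellation will Thread.interrupt() task threads
  try {
    body // any Spark jobs submitted here carry `jobTag`
  } finally {
    sc.removeJobTag(jobTag)
  }
}
```
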
@@ -47,6 +47,7 @@ message JobData {
optional int64 completion_time = 5;
repeated int64 stage_ids = 6;
optional string job_group = 7;
repeated string job_tags = 21;
JobExecutionStatus status = 8;
int32 num_tasks = 9;
int32 num_active_tasks = 10;
77 changes: 77 additions & 0 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -829,6 +829,55 @@ class SparkContext(config: SparkConf) extends Logging {
setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, null)
}

/**
* Set the behavior of job cancellation from jobs started in this thread.
*
* @param interruptOnCancel If true, then job cancellation will result in `Thread.interrupt()`
* being called on the job's executor threads. This is useful to help ensure that the tasks
* are actually stopped in a timely manner, but is off by default due to HDFS-1208, where HDFS
[Inline review comment, Contributor]: Someone should check if this is still a thing :)

[Reply, juliuszsompolski (Contributor, Author), Jun 20, 2023]: The HDFS-1208 bug is still open... but multiple places in Spark core have by now elected to just pass true here, so it likely doesn't make sense for the user to set it to false, as these places would generate interrupts anyway... But removing it completely would be orthogonal to this PR.

* may respond to Thread.interrupt() by marking nodes as dead.
*/
def setInterruptOnCancel(interruptOnCancel: Boolean): Unit = {
setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, interruptOnCancel.toString)
}

/**
* Add a tag to be assigned to all the jobs started by this thread.
*
* @param tag The tag to be added. Cannot contain ',' (comma) character.
*/
def addJobTag(tag: String): Unit = {
SparkContext.throwIfInvalidTag(tag)
val existingTags = getJobTags()
val newTags = (existingTags + tag).mkString(SparkContext.SPARK_JOB_TAGS_SEP)
setLocalProperty(SparkContext.SPARK_JOB_TAGS, newTags)
}

/**
* Remove a tag previously added to be assigned to all the jobs started by this thread.
* Noop if such a tag was not added earlier.
*
* @param tag The tag to be removed. Cannot contain ',' (comma) character.
*/
def removeJobTag(tag: String): Unit = {
SparkContext.throwIfInvalidTag(tag)
val existingTags = getJobTags()
val newTags = (existingTags - tag).mkString(SparkContext.SPARK_JOB_TAGS_SEP)
setLocalProperty(SparkContext.SPARK_JOB_TAGS, newTags)
}

/** Get the tags that are currently set to be assigned to all the jobs started by this thread. */
def getJobTags(): Set[String] = {
Option(getLocalProperty(SparkContext.SPARK_JOB_TAGS))
.map(_.split(SparkContext.SPARK_JOB_TAGS_SEP).toSet)
.getOrElse(Set())
}

/** Clear the current thread's job tags. */
def clearJobTags(): Unit = {
setLocalProperty(SparkContext.SPARK_JOB_TAGS, null)
}
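
To illustrate how the thread-local tag API above is meant to be used, here is a short sketch (not part of the diff): it assumes a local SparkSession named `spark`, and the tag names are made up.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("job-tag-demo").getOrCreate()
val sc = spark.sparkContext

sc.addJobTag("etl")          // jobs started by this thread now carry the "etl" tag
sc.addJobTag("nightly")      // tags accumulate as a set on the thread
assert(sc.getJobTags() == Set("etl", "nightly"))

sc.range(0, 1000000).count() // this job is tagged with both tags

sc.removeJobTag("nightly")   // subsequent jobs from this thread keep only "etl"
sc.clearJobTags()            // or drop all tags for this thread at once
```

The tags live in the single `spark.job.tags` local property, joined with `,`, which is why a comma is not allowed inside a tag.
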

/**
* Execute a block of code in a scope such that all new RDDs created in this body will
* be part of the same scope. For more detail, see {{org.apache.spark.rdd.RDDOperationScope}}.
@@ -2471,6 +2520,17 @@
dagScheduler.cancelJobGroup(groupId)
}

/**
* Cancel active jobs that have the specified tag. See `org.apache.spark.SparkContext.addJobTag`.
*
* @param tag The tag of the jobs to be cancelled. Cannot contain ',' (comma) character.
*/
def cancelJobsWithTag(tag: String): Unit = {
SparkContext.throwIfInvalidTag(tag)
assertNotStopped()
dagScheduler.cancelJobsWithTag(tag)
}
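
Because tags travel with the submitted jobs, any other thread that knows the tag can cancel them with `cancelJobsWithTag`. A hedged sketch continuing the previous example; the tag, the long-running job, and the sleep are illustrative only.

```scala
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global

val runner = Future {
  sc.addJobTag("long-query")
  sc.setInterruptOnCancel(true)
  try {
    sc.range(0L, 10000000000L).count() // a deliberately long-running tagged job
  } finally {
    sc.removeJobTag("long-query")
  }
}

Thread.sleep(2000)                 // give the job time to start (illustration only)
sc.cancelJobsWithTag("long-query") // cancels every active job carrying this tag
```
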

/** Cancel all jobs that have been scheduled or are running. */
def cancelAllJobs(): Unit = {
assertNotStopped()
@@ -2840,6 +2900,7 @@ object SparkContext extends Logging {
private[spark] val SPARK_JOB_DESCRIPTION = "spark.job.description"
private[spark] val SPARK_JOB_GROUP_ID = "spark.jobGroup.id"
private[spark] val SPARK_JOB_INTERRUPT_ON_CANCEL = "spark.job.interruptOnCancel"
private[spark] val SPARK_JOB_TAGS = "spark.job.tags"
private[spark] val SPARK_SCHEDULER_POOL = "spark.scheduler.pool"
private[spark] val RDD_SCOPE_KEY = "spark.rdd.scope"
private[spark] val RDD_SCOPE_NO_OVERRIDE_KEY = "spark.rdd.scope.noOverride"
@@ -2851,6 +2912,22 @@
*/
private[spark] val DRIVER_IDENTIFIER = "driver"

/** Separator of tags in SPARK_JOB_TAGS property */
private[spark] val SPARK_JOB_TAGS_SEP = ","

private[spark] def throwIfInvalidTag(tag: String): Unit = {
if (tag == null) {
throw new IllegalArgumentException("Spark job tag cannot be null.")
}
if (tag.contains(SPARK_JOB_TAGS_SEP)) {
throw new IllegalArgumentException(
s"Spark job tag cannot contain '$SPARK_JOB_TAGS_SEP'.")
}
if (tag.isEmpty) {
throw new IllegalArgumentException(
"Spark job tag cannot be an empty string.")
}
}
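
A quick sketch of what `throwIfInvalidTag` rejects; the failing calls are left commented out so the snippet runs cleanly, and the tag values are illustrative.

```scala
sc.addJobTag("ok-tag")   // accepted: non-null, non-empty, no comma
// Each of the following would throw IllegalArgumentException via throwIfInvalidTag:
// sc.addJobTag(null)    // null tag
// sc.addJobTag("")      // empty tag
// sc.addJobTag("a,b")   // contains the ',' separator used by spark.job.tags
```
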

private implicit def arrayToArrayWritable[T <: Writable : ClassTag](arr: Iterable[T])
: ArrayWritable = {
11 changes: 11 additions & 0 deletions core/src/main/scala/org/apache/spark/SparkStatusTracker.scala
@@ -52,6 +52,17 @@ class SparkStatusTracker private[spark] (sc: SparkContext, store: AppStatusStore
store.jobsList(null).filter(_.jobGroup == expected).map(_.jobId).toArray
}

/**
* Return a list of all known jobs with a particular tag.
*
* The returned list may contain running, failed, and completed jobs, and may vary across
* invocations of this method. This method does not guarantee the order of the elements in
* its result.
*/
def getJobIdsForTag(jobTag: String): Array[Int] = {
store.jobsList(null).filter(_.jobTags.contains(jobTag)).map(_.jobId).toArray
}
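
A small monitoring sketch for the new tracker method; it assumes the `sc` from the earlier examples and that some jobs tagged "etl" have already been submitted.

```scala
// Poll job ids by tag and print their current status.
val etlJobIds: Array[Int] = sc.statusTracker.getJobIdsForTag("etl")
etlJobIds.foreach { jobId =>
  sc.statusTracker.getJobInfo(jobId).foreach { info =>
    println(s"job $jobId: status=${info.status()} stages=${info.stageIds().mkString(",")}")
  }
}
```
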

/**
* Returns an array containing the ids of all active stages.
*
25 changes: 25 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -1085,6 +1085,15 @@ private[spark] class DAGScheduler(
eventProcessLoop.post(JobGroupCancelled(groupId))
}

/**
* Cancel all jobs with a given tag.
*/
def cancelJobsWithTag(tag: String): Unit = {
SparkContext.throwIfInvalidTag(tag)
logInfo(s"Asked to cancel jobs with tag $tag")
eventProcessLoop.post(JobTagCancelled(tag))
}

/**
* Cancel all jobs that are running or waiting in the queue.
*/
@@ -1182,6 +1191,19 @@
Option("part of cancelled job group %s".format(groupId))))
}

private[scheduler] def handleJobTagCancelled(tag: String): Unit = {
// Cancel all active jobs that have this tag.
// First find all active jobs with this tag, and then cancel them.
val jobIds = activeJobs.filter { activeJob =>
Option(activeJob.properties).exists { properties =>
Option(properties.getProperty(SparkContext.SPARK_JOB_TAGS)).getOrElse("")
.split(SparkContext.SPARK_JOB_TAGS_SEP).toSet.contains(tag)
}
}.map(_.jobId)
jobIds.foreach(handleJobCancellation(_,
Option(s"part of cancelled job tag $tag")))
}

private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo): Unit = {
listenerBus.post(SparkListenerTaskStart(task.stageId, task.stageAttemptId, taskInfo))
}
@@ -2972,6 +2994,9 @@ private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler
case JobGroupCancelled(groupId) =>
dagScheduler.handleJobGroupCancelled(groupId)

case JobTagCancelled(tag) =>
dagScheduler.handleJobTagCancelled(tag)

case AllJobsCancelled =>
dagScheduler.doCancelAllJobs()

@@ -63,6 +63,8 @@ private[scheduler] case class JobCancelled(

private[scheduler] case class JobGroupCancelled(groupId: String) extends DAGSchedulerEvent

private[scheduler] case class JobTagCancelled(tagName: String) extends DAGSchedulerEvent

private[scheduler] case object AllJobsCancelled extends DAGSchedulerEvent

private[scheduler]
@@ -438,6 +438,12 @@ private[spark] class AppStatusListener(
.flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_DESCRIPTION)) }
val jobGroup = Option(event.properties)
.flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_GROUP_ID)) }
val jobTags = Option(event.properties)
.flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_TAGS)) }
.map(_.split(SparkContext.SPARK_JOB_TAGS_SEP).toSet)
.getOrElse(Set())
.toSeq
.sorted
val sqlExecutionId = Option(event.properties)
.flatMap(p => Option(p.getProperty(SQL_EXECUTION_ID_KEY)).map(_.toLong))

@@ -448,6 +454,7 @@
if (event.time > 0) Some(new Date(event.time)) else None,
event.stageIds,
jobGroup,
jobTags,
numTasks,
sqlExecutionId)
liveJobs.put(event.jobId, job)
2 changes: 2 additions & 0 deletions core/src/main/scala/org/apache/spark/status/LiveEntity.scala
@@ -66,6 +66,7 @@ private class LiveJob(
val submissionTime: Option[Date],
val stageIds: Seq[Int],
jobGroup: Option[String],
jobTags: Seq[String],
numTasks: Int,
sqlExecutionId: Option[Long]) extends LiveEntity {

@@ -98,6 +99,7 @@
completionTime,
stageIds,
jobGroup,
jobTags,
status,
numTasks,
activeTasks,
@@ -199,6 +199,7 @@ class JobData private[spark](
val completionTime: Option[Date],
val stageIds: collection.Seq[Int],
val jobGroup: Option[String],
val jobTags: collection.Seq[String],
val status: JobExecutionStatus,
val numTasks: Int,
val numActiveTasks: Int,
@@ -71,6 +71,7 @@ private[protobuf] class JobDataWrapperSerializer extends ProtobufSerDe[JobDataWr
}
jobData.stageIds.foreach(id => jobDataBuilder.addStageIds(id.toLong))
jobData.jobGroup.foreach(jobDataBuilder.setJobGroup)
jobData.jobTags.foreach(jobDataBuilder.addJobTags)
jobData.killedTasksSummary.foreach { entry =>
jobDataBuilder.putKillTasksSummary(entry._1, entry._2)
}
@@ -93,6 +94,7 @@
completionTime = completionTime,
stageIds = info.getStageIdsList.asScala.map(_.toInt),
jobGroup = jobGroup,
jobTags = info.getJobTagsList.asScala,
status = status,
numTasks = info.getNumTasks,
numActiveTasks = info.getNumActiveTasks,
@@ -2,6 +2,7 @@
"jobId" : 0,
"name" : "foreach at <console>:15",
"stageIds" : [ 0 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -2,6 +2,7 @@
"jobId" : 0,
"name" : "foreach at <console>:15",
"stageIds" : [ 0 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -2,6 +2,7 @@
"jobId" : 2,
"name" : "count at <console>:17",
"stageIds" : [ 3 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -19,6 +20,7 @@
"jobId" : 1,
"name" : "count at <console>:20",
"stageIds" : [ 1, 2 ],
"jobTags" : [ ],
"status" : "FAILED",
"numTasks" : 16,
"numActiveTasks" : 0,
@@ -36,6 +38,7 @@
"jobId" : 0,
"name" : "count at <console>:15",
"stageIds" : [ 0 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -2,6 +2,7 @@
"jobId" : 0,
"name" : "count at <console>:15",
"stageIds" : [ 0 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -2,6 +2,7 @@
"jobId" : 2,
"name" : "count at <console>:17",
"stageIds" : [ 3 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -19,6 +20,7 @@
"jobId" : 1,
"name" : "count at <console>:20",
"stageIds" : [ 1, 2 ],
"jobTags" : [ ],
"status" : "FAILED",
"numTasks" : 16,
"numActiveTasks" : 0,
@@ -36,6 +38,7 @@
"jobId" : 0,
"name" : "count at <console>:15",
"stageIds" : [ 0 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -2,6 +2,7 @@
"jobId" : 2,
"name" : "count at <console>:17",
"stageIds" : [ 3 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,
@@ -19,6 +20,7 @@
"jobId" : 0,
"name" : "count at <console>:15",
"stageIds" : [ 0 ],
"jobTags" : [ ],
"status" : "SUCCEEDED",
"numTasks" : 8,
"numActiveTasks" : 0,