[SPARK-38550][SQL][CORE] Use a disk-based store to save more debug information for live UI #35856
org.apache.spark.status.AppStatusStore

@@ -17,12 +17,17 @@

```scala
package org.apache.spark.status

import java.io.File
import java.nio.file.Files
import java.util.{List => JList}

import scala.collection.JavaConverters._
import scala.collection.mutable.HashMap
import scala.util.control.NonFatal

import org.apache.spark.{JobExecutionStatus, SparkConf, SparkContext}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.Status.DISK_STORE_DIR_FOR_STATUS
import org.apache.spark.status.api.v1
import org.apache.spark.storage.FallbackStorage.FALLBACK_BLOCK_MANAGER_ID
import org.apache.spark.ui.scope._
```

@@ -34,6 +39,7 @@ import org.apache.spark.util.kvstore.{InMemoryStore, KVStore}

```scala
 */
private[spark] class AppStatusStore(
    val store: KVStore,
    val diskStore: Option[KVStore] = None,
    val listener: Option[AppStatusListener] = None) {

  def applicationInfo(): v1.ApplicationInfo = {
```

@@ -755,18 +761,33 @@ private[spark] class AppStatusStore(

```scala
  }
}

private[spark] object AppStatusStore extends Logging {

  val CURRENT_VERSION = 2L

  /**
   * Create an in-memory store for a live application. Also create a disk store if
   * `spark.appStatusStore.diskStore.dir` is set.
   */
  def createLiveStore(
      conf: SparkConf,
      appStatusSource: Option[AppStatusSource] = None): AppStatusStore = {
    val store = new ElementTrackingStore(new InMemoryStore(), conf)
    val listener = new AppStatusListener(store, conf, true, appStatusSource)
    // Create a disk-based KV store if the directory is set.
    val diskStore = conf.get(DISK_STORE_DIR_FOR_STATUS).flatMap { storeDir =>
      val storePath = Files.createDirectories(
        new File(storeDir, System.currentTimeMillis().toString).toPath
      ).toFile
      try {
        Some(KVUtils.open(storePath, AppStatusStoreMetadata(CURRENT_VERSION), conf))
          .map(new ElementTrackingStore(_, conf))
      } catch {
        case NonFatal(e) =>
          logWarning("Failed to create disk-based app status store: ", e)
          None
      }
    }
    new AppStatusStore(store, diskStore = diskStore, listener = Some(listener))
  }
}
```
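To illustrate the new code path, a minimal sketch (not part of the PR): the config key comes from the scaladoc above, the directory path is an arbitrary example, and `createLiveStore` is `private[spark]`, so this would only compile from Spark-internal code or a test.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.status.AppStatusStore

// When the directory is configured, createLiveStore opens a second,
// disk-based KVStore under a <dir>/<currentTimeMillis> subdirectory,
// alongside the usual in-memory store.
val conf = new SparkConf()
  .set("spark.appStatusStore.diskStore.dir", "/tmp/spark-status-store")
val statusStore = AppStatusStore.createLiveStore(conf)

// If opening the store had failed, diskStore would simply be None (with a
// warning logged) and the live UI would behave as before.
assert(statusStore.diskStore.isDefined)
```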
org.apache.spark.sql.diagnostic.DiagnosticListener (new file)

@@ -0,0 +1,112 @@

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.diagnostic

import org.apache.spark.SparkConf
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.execution.ExplainMode
import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart}
import org.apache.spark.sql.internal.StaticSQLConf.UI_RETAINED_EXECUTIONS
import org.apache.spark.status.{ElementTrackingStore, KVUtils}

/**
 * A Spark listener that writes diagnostic information to a data store. The information can be
 * accessed by the public REST API.
 *
 * @param kvStore used to store the diagnostic information
 */
class DiagnosticListener(
    conf: SparkConf,
    kvStore: ElementTrackingStore) extends SparkListener {

  kvStore.addTrigger(
    classOf[ExecutionDiagnosticData],
    conf.get(UI_RETAINED_EXECUTIONS)) { count =>
    cleanupExecutions(count)
  }

  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case e: SparkListenerSQLExecutionStart => onExecutionStart(e)
    case e: SparkListenerSQLExecutionEnd => onExecutionEnd(e)
    case e: SparkListenerSQLAdaptiveExecutionUpdate => onAdaptiveExecutionUpdate(e)
    case _ => // Ignore
  }

  private def onAdaptiveExecutionUpdate(event: SparkListenerSQLAdaptiveExecutionUpdate): Unit = {
    val data = new AdaptiveExecutionUpdate(
      event.executionId,
      System.currentTimeMillis(),
      event.physicalPlanDescription)
    kvStore.write(data)
  }

  private def onExecutionStart(event: SparkListenerSQLExecutionStart): Unit = {
    val sqlConf = event.qe.sparkSession.sessionState.conf
    val planDescriptionMode = ExplainMode.fromString(sqlConf.uiExplainMode)
    val physicalPlan = event.qe.explainString(
      planDescriptionMode, sqlConf.maxToStringFieldsForDiagnostic)
    val data = new ExecutionDiagnosticData(
      event.executionId,
      physicalPlan,
      event.time,
      None,
      None)
    // Check triggers since it's adding new entries
    kvStore.write(data, checkTriggers = true)
  }

  private def onExecutionEnd(event: SparkListenerSQLExecutionEnd): Unit = {
    try {
      val existing = kvStore.read(classOf[ExecutionDiagnosticData], event.executionId)
      val sqlConf = event.qe.sparkSession.sessionState.conf
      val planDescriptionMode = ExplainMode.fromString(sqlConf.uiExplainMode)
      val physicalPlan = event.qe.explainString(
        planDescriptionMode, sqlConf.maxToStringFieldsForDiagnostic)
      val data = new ExecutionDiagnosticData(
        event.executionId,
        physicalPlan,
        existing.submissionTime,
        Some(event.time),
        event.executionFailure.map(
          e => s"${e.getClass.getCanonicalName}: ${e.getMessage}").orElse(Some("")))
      kvStore.write(data)
    } catch {
      case _: NoSuchElementException =>
        // This is possibly caused by the query failing before execution.
    }
  }

  private def cleanupExecutions(count: Long): Unit = {
    val countToDelete = count - conf.get(UI_RETAINED_EXECUTIONS)
    if (countToDelete <= 0) {
      return
    }
    val view = kvStore.view(classOf[ExecutionDiagnosticData]).index("completionTime").first(0L)
    val toDelete = KVUtils.viewToSeq(view, countToDelete.toInt)(_.completionTime.isDefined)
    toDelete.foreach(e => kvStore.delete(classOf[ExecutionDiagnosticData], e.executionId))
    kvStore.removeAllByIndexValues(
      classOf[AdaptiveExecutionUpdate], "id", toDelete.map(_.executionId))
  }
}

object DiagnosticListener {
  val QUEUE_NAME = "diagnostics"
}
```

Review comment on `class DiagnosticListener`: "can we add some classdoc?"
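The retention logic above relies on `ElementTrackingStore` triggers. A minimal sketch of that mechanism, assuming it is exercised from Spark-internal code (the threshold and record values are arbitrary):

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.diagnostic.ExecutionDiagnosticData
import org.apache.spark.status.ElementTrackingStore
import org.apache.spark.util.kvstore.InMemoryStore

// The callback fires asynchronously once the number of tracked entities
// passes the threshold, mirroring how DiagnosticListener schedules
// cleanupExecutions.
val store = new ElementTrackingStore(new InMemoryStore(), new SparkConf())
store.addTrigger(classOf[ExecutionDiagnosticData], 10L) { count =>
  println(s"over threshold, current count = $count")
}
(1L to 12L).foreach { id =>
  // checkTriggers = true asks the store to evaluate triggers on this write,
  // just like onExecutionStart does.
  store.write(new ExecutionDiagnosticData(id, "plan", id, None, None), checkTriggers = true)
}
```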
org.apache.spark.sql.diagnostic.DiagnosticStore (new file)

@@ -0,0 +1,73 @@

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.diagnostic

import scala.collection.JavaConverters._

import com.fasterxml.jackson.annotation.JsonIgnore

import org.apache.spark.status.KVUtils.KVIndexParam
import org.apache.spark.util.kvstore.{KVIndex, KVStore}

/**
 * Provides a view of a KVStore with methods that make it easy to query diagnostic-specific
 * information. There's no state kept in this class, so it's ok to have multiple instances
 * of it in an application.
 */
class DiagnosticStore(store: KVStore) {

  def diagnosticsList(offset: Int, length: Int): Seq[ExecutionDiagnosticData] = {
    store.view(classOf[ExecutionDiagnosticData]).skip(offset).max(length).asScala.toSeq
  }

  def diagnostic(executionId: Long): Option[ExecutionDiagnosticData] = {
    try {
      Some(store.read(classOf[ExecutionDiagnosticData], executionId))
    } catch {
      case _: NoSuchElementException => None
    }
  }

  def adaptiveExecutionUpdates(executionId: Long): Seq[AdaptiveExecutionUpdate] = {
    store.view(classOf[AdaptiveExecutionUpdate])
      .index("updateTime")
      .parent(executionId)
      .asScala
      .toSeq
  }
}

/* Represents the diagnostic data of a SQL execution */
class ExecutionDiagnosticData(
    @KVIndexParam val executionId: Long,
    val physicalPlan: String,
    val submissionTime: Long,
    val completionTime: Option[Long],
    val errorMessage: Option[String])

/* Represents the plan change of an adaptive execution */
class AdaptiveExecutionUpdate(
    @KVIndexParam("id")
    val executionId: Long,
    @KVIndexParam(value = "updateTime", parent = "id")
    val updateTime: Long,
    val physicalPlan: String) {

  @JsonIgnore @KVIndex
  private def naturalIndex: Array[Long] = Array(executionId, updateTime)
}
```
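A usage sketch for the store above, using only the methods it defines (`printDiagnostics` is a hypothetical helper; `kvStore` is assumed to be the store that `DiagnosticListener` writes to):

```scala
import org.apache.spark.sql.diagnostic.DiagnosticStore
import org.apache.spark.util.kvstore.KVStore

// Reading diagnostic records back for display, e.g. from a REST API handler.
def printDiagnostics(kvStore: KVStore): Unit = {
  val diagnostics = new DiagnosticStore(kvStore)
  // Page through the first 20 executions.
  diagnostics.diagnosticsList(offset = 0, length = 20).foreach { d =>
    println(s"execution ${d.executionId}, submitted at ${d.submissionTime}")
    // Replay the AQE plan changes for this execution, ordered by update time.
    diagnostics.adaptiveExecutionUpdates(d.executionId).foreach { u =>
      println(s"  plan updated at ${u.updateTime}")
    }
  }
}
```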
org.apache.spark.sql.internal.SharedState

@@ -31,6 +31,7 @@ import org.apache.hadoop.fs.{FsUrlStreamHandlerFactory, Path}

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.catalog._
import org.apache.spark.sql.diagnostic.DiagnosticListener
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.execution.CacheManager
import org.apache.spark.sql.execution.streaming.StreamExecution
```

@@ -118,6 +119,12 @@ private[sql] class SharedState(

```scala
    statusStore
  }

  sparkContext.statusStore.diskStore.foreach { kvStore =>
    sparkContext.listenerBus.addToQueue(
      new DiagnosticListener(conf, kvStore.asInstanceOf[ElementTrackingStore]),
      DiagnosticListener.QUEUE_NAME)
  }

  /**
   * A [[StreamingQueryListener]] for structured streaming ui, it contains all streaming query ui
   * data to show.
```
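Putting the hunks together, a rough end-to-end sketch (the directory path is illustrative; the listener registration happens automatically in `SharedState` once the disk store exists):

```scala
import org.apache.spark.sql.SparkSession

// With the directory configured, createLiveStore opens the disk store,
// SharedState registers DiagnosticListener on its own "diagnostics" queue,
// and each SQL execution leaves an ExecutionDiagnosticData record behind.
val spark = SparkSession.builder()
  .master("local[*]")
  .config("spark.appStatusStore.diskStore.dir", "/tmp/spark-status-store")
  .getOrCreate()

spark.range(0, 1000).selectExpr("sum(id)").collect()
// /tmp/spark-status-store/<timestamp>/ now holds the captured plans.
```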
Review discussion

Reviewer: FYI, currently, all disk stores are broken on Apple Silicon.

Author: Thanks for letting me know. If the failure happens during initialization, I think we are safe here.