diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html index 37d56a06ded7..25d1a10a4d2f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html @@ -65,7 +65,22 @@

Summary

-

Executors

+

+
+ + + + + + + + + + + +
<th>Metric</th><th>Min</th><th>25th percentile</th><th>Median</th><th>75th percentile</th><th>Max</th>
+
+

Executors

diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index b7fbe0492b6d..0de70bedbf81 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -15,7 +15,7 @@ * limitations under the License. */ -/* global $, Mustache, createRESTEndPointForExecutorsPage, createRESTEndPointForMiscellaneousProcess, */ +/* global $, Mustache, createRESTEndPointForExecutorsPage, createRESTEndPointForMiscellaneousProcess, createRESTEndPointForExecutorsSummaries, */ /* global createTemplateURI, formatBytes, formatDuration, formatLogsCells, getStandAloneAppId, */ /* global jQuery, setDataTableDefaults */ @@ -143,23 +143,161 @@ var sumOptionalColumns = [3, 4]; var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14, 25]; var execDataTable; var sumDataTable; +var executorSummaryMetricsTableArray = []; +var executorSummaryMetricsTableCurrentStateArray = []; +var executorSummaryMetricsDataTable; function reselectCheckboxesBasedOnTaskTableState() { var allChecked = true; + var executorSummaryMetricsTableCurrentFilteredArray = executorSummaryMetricsTableCurrentStateArray.slice(); if (typeof execDataTable !== "undefined") { for (var k = 0; k < execOptionalColumns.length; k++) { if (execDataTable.column(execOptionalColumns[k]).visible()) { $("[data-exec-col-idx=" + execOptionalColumns[k] + "]").prop("checked", true); + var executorSumCheckBoxId = $("[data-exec-col-idx=" + execOptionalColumns[k] + "]").attr('exec-sum-idx'); + var selectedExecutorSummaryMetrics = executorSummaryMetricsTableArray.filter(row => (row.executorSumCheckBoxId).toString() == executorSumCheckBoxId) + for(var value in selectedExecutorSummaryMetrics) { + executorSummaryMetricsTableCurrentStateArray.push(selectedExecutorSummaryMetrics[value]); + } + executorSummaryMetricsTableCurrentFilteredArray = 
executorSummaryMetricsTableCurrentStateArray.slice(); } else { allChecked = false; } } } + + createDataTableForExecutorSummaryMetricsTable(executorSummaryMetricsTableCurrentFilteredArray) + if (allChecked) { $("#select-all-box").prop("checked", true); } } +var executorMetricsColumnName = { + "JVMHeapMemory": "Peak JVM Heap Memory", + "JVMOffHeapMemory": "Peak JVM Off Heap Memory", + "OnHeapExecutionMemory": "Peak On Heap Execution Memory", + "OffHeapExecutionMemory": "Peak Off Heap Execution Memory", + "OnHeapStorageMemory": "Peak On Heap Storage Memory", + "OffHeapStorageMemory": "Peak Off Heap Storage Memory", + "OnHeapUnifiedMemory": "Peak On Heap Unified Memory", + "OffHeapUnifiedMemory": "Peak Off Heap Unified Memory", + "DirectPoolMemory": "Peak Direct Pool Memory", + "MappedPoolMemory": "Peak Mapped Pool Memory", + "ProcessTreeJVMVMemory": "Process Tree JVM VMemory", + "ProcessTreeJVMRSSMemory": "Process Tree JVM RSS Memory", + "ProcessTreePythonVMemory": "Process Tree Python VMemory", + "ProcessTreePythonRSSMemory": "Process Tree Python RSS Memory", + "ProcessTreeOtherVMemory": "Process Tree Other VMemory", + "ProcessTreeOtherRSSMemory": "Process Tree Other RSS Memory", + "MinorGCCount": "Peak Minor GC Count", + "MinorGCTime": "Peak Minor GC Time", + "MajorGCCount": "Peak Major GC Count", + "MajorGCTime": "Peak Major GC Time" +} + +function createRowMetadataForColumn(colKey, data, executorSumCheckBoxId) { + var row = { + "metric": executorMetricsColumnName[colKey], + "data": data, + "executorSumCheckBoxId": executorSumCheckBoxId, + "columnKey": colKey + }; + return row; +} + +function displayRowsForSummaryMetricsTable(row, type, columnIndex) { + var str; + switch(row.columnKey) { + case 'MinorGCCount': + return row.data[columnIndex]; + + case 'MinorGCTime': + str = formatDuration(row.data[columnIndex]); + return str; + + case 'MajorGCCount': + return row.data[columnIndex]; + + case 'MajorGCTime': + str = formatDuration(row.data[columnIndex]); + return str; 
+ + default: + str = formatBytes(row.data[columnIndex], type); + return str; + } +} + +function createDataTableForExecutorSummaryMetricsTable(executorSummaryMetricsTable) { + var executorMetricsTable = "#summary-executor-metrics-table"; + $(executorMetricsTable).hide(); + $('#executorSummaryMetricsTitle').hide(); + if ($.fn.dataTable.isDataTable(executorMetricsTable)) { + executorSummaryMetricsDataTable.clear().draw(); + executorSummaryMetricsDataTable.rows.add(executorSummaryMetricsTable).draw(); + } else { + var executorSummaryConf = { + "data": executorSummaryMetricsTable, + "columns": [ + {data : 'metric'}, + // Min + { + data: function (row, type) { + return displayRowsForSummaryMetricsTable(row, type, 0); + } + }, + // 25th percentile + { + data: function (row, type) { + return displayRowsForSummaryMetricsTable(row, type, 1); + } + }, + // Median + { + data: function (row, type) { + return displayRowsForSummaryMetricsTable(row, type, 2); + } + }, + // 75th percentile + { + data: function (row, type) { + return displayRowsForSummaryMetricsTable(row, type, 3); + } + }, + // Max + { + data: function (row, type) { + return displayRowsForSummaryMetricsTable(row, type, 4); + } + } + ], + "columnDefs": [ + { "type": "duration", "targets": 1 }, + { "type": "duration", "targets": 2 }, + { "type": "duration", "targets": 3 }, + { "type": "duration", "targets": 4 }, + { "type": "duration", "targets": 5 } + ], + "paging": false, + "info": false, + "searching": false, + "order": [[0, "asc"]], + "bSort": false, + "bAutoWidth": false, + "oLanguage": { + "sEmptyTable": "No tasks have reported metrics yet" + } + }; + executorSummaryMetricsDataTable = $(executorMetricsTable).DataTable(executorSummaryConf); + } + if (executorSummaryMetricsTable.length > 0) { + $(executorMetricsTable).show(); + $('#executorSummaryMetricsTitle').show(); + } + executorSummaryMetricsTableCurrentStateArray = executorSummaryMetricsTable.slice(); +} + $(document).ready(function () { 
setDataTableDefaults(); @@ -618,6 +756,145 @@ $(document).ready(function () { } }); + $('#executorSummaryMetricsTitle').append("Summary Metrics for " + "" + allExecCnt + " Executors" + ""); + $('#executorsTitle').html("Executors (" + allExecCnt + ")"); + var quantiles = "0,0.25,0.5,0.75,1.0"; + var executorSummariesEndpoint = createRESTEndPointForExecutorsSummaries(appId) + "?activeOnly=true&quantiles=" + quantiles; + $.getJSON(executorSummariesEndpoint, function(executorMetricsResponse, _ignored_status, _ignored_jqXHR) { + var taskMetricKeys = Object.keys(executorMetricsResponse); + taskMetricKeys.forEach(function (columnKey) { + var row; + switch(columnKey) { + + case "JVMHeapMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 3); + executorSummaryMetricsTableArray.push(row); + break; + + case "JVMOffHeapMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 3); + executorSummaryMetricsTableArray.push(row); + break; + + case "OnHeapExecutionMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 4); + executorSummaryMetricsTableArray.push(row); + break; + + case "OffHeapExecutionMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 4); + executorSummaryMetricsTableArray.push(row); + break; + + case "OnHeapStorageMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 5); + executorSummaryMetricsTableArray.push(row); + break; + + case "OffHeapStorageMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 5); + executorSummaryMetricsTableArray.push(row); + break; + + case "OnHeapUnifiedMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 1); + executorSummaryMetricsTableArray.push(row); + break; + + case "OffHeapUnifiedMemory": + row = createRowMetadataForColumn( + columnKey, 
executorMetricsResponse[columnKey], 2); + executorSummaryMetricsTableArray.push(row); + break; + + case "DirectPoolMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 6); + executorSummaryMetricsTableArray.push(row); + break; + + case "MappedPoolMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 6); + executorSummaryMetricsTableArray.push(row); + break; + + case "ProcessTreeJVMVMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 7); + executorSummaryMetricsTableArray.push(row); + break; + + case "ProcessTreeJVMRSSMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 7); + executorSummaryMetricsTableArray.push(row); + break; + + case "ProcessTreePythonVMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 7); + executorSummaryMetricsTableArray.push(row); + break; + + case "ProcessTreePythonRSSMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 7); + executorSummaryMetricsTableArray.push(row); + break; + + case "ProcessTreeOtherVMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 7); + executorSummaryMetricsTableArray.push(row); + break; + + case "ProcessTreeOtherRSSMemory": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 7); + executorSummaryMetricsTableArray.push(row); + break; + + case "MinorGCCount": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 8); + executorSummaryMetricsTableArray.push(row); + break; + + case "MinorGCTime": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 8); + executorSummaryMetricsTableArray.push(row); + break; + + case "MajorGCCount": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 8); + 
executorSummaryMetricsTableArray.push(row); + break; + + case "MajorGCTime": + row = createRowMetadataForColumn( + columnKey, executorMetricsResponse[columnKey], 8); + executorSummaryMetricsTableArray.push(row); + break; + + default: + break; + } + }); + var executorSummaryMetricsTableFilteredArray = [] + executorSummaryMetricsTableCurrentStateArray = executorSummaryMetricsTableFilteredArray.slice(); + reselectCheckboxesBasedOnTaskTableState(); + }); + var sumSelector = "#summary-execs-table"; var sumConf = { "data": [activeSummary, deadSummary, totalSummary], @@ -713,16 +990,18 @@ $(document).ready(function () { "Show Additional Metrics" + "" + "
" + - "
Select All
" + - "
On Heap Memory
" + - "
Off Heap Memory
" + - "
Peak JVM Memory OnHeap / OffHeap
" + - "
Peak Execution Memory OnHeap / OffHeap
" + - "
Peak Storage Memory OnHeap / OffHeap
" + - "
Peak Pool Memory Direct / Mapped
" + - "
Resources
" + - "
Resource Profile Id
" + - "
Exec Loss Reason
" + + "
Select All
" + + "
On Heap Memory
" + + "
Off Heap Memory
" + + "
Peak JVM Memory OnHeap / OffHeap
" + + "
Peak Execution Memory OnHeap / OffHeap
" + + "
Peak Storage Memory OnHeap / OffHeap
" + + "
Peak Pool Memory Direct / Mapped
" + + "
Peak Process Tree Memory
" + + "
Peak GC Count / Times
" + + "
Resources
" + + "
Resource Profile Id
" + + "
Exec Loss Reason
" + "
"); reselectCheckboxesBasedOnTaskTableState(); @@ -744,10 +1023,12 @@ $(document).ready(function () { $(".toggle-vis").prop("checked", true); sumColumn.visible(true); execColumn.visible(true); + createDataTableForExecutorSummaryMetricsTable(executorSummaryMetricsTableArray); } else { $(".toggle-vis").prop("checked", false); sumColumn.visible(false); execColumn.visible(false); + createDataTableForExecutorSummaryMetricsTable([]); } } else { var execColIdx = thisBox.attr("data-exec-col-idx"); @@ -758,6 +1039,21 @@ $(document).ready(function () { var sumCol = sumDataTable.column(sumColIdx); sumCol.visible(!sumCol.visible()); } + var para = thisBox.attr('exec-sum-idx'); + if(para !== '') { + var executorSummaryMetricsTableFilteredArray = [] + if (thisBox.is(":checked")) { + var selectedExecutorSummaryMetrics = executorSummaryMetricsTableArray.filter(row => (row.executorSumCheckBoxId).toString() == para) + for(var value in selectedExecutorSummaryMetrics) { + executorSummaryMetricsTableCurrentStateArray.push(selectedExecutorSummaryMetrics[value]); + } + executorSummaryMetricsTableFilteredArray = executorSummaryMetricsTableCurrentStateArray.slice(); + } else { + executorSummaryMetricsTableFilteredArray = + executorSummaryMetricsTableCurrentStateArray.filter(row => (row.executorSumCheckBoxId).toString() != para); + } + createDataTableForExecutorSummaryMetricsTable(executorSummaryMetricsTableFilteredArray); + } } }); diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js index 1bec1c174df5..85485a2fcd5a 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/utils.js +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -205,6 +205,29 @@ function createRESTEndPointForExecutorsPage(appId) { return uiRoot + "/api/v1/applications/" + appId + "/allexecutors"; } +function createRESTEndPointForExecutorsSummaries(appId) { + var words = getBaseURI().split('/'); + var ind = 
words.indexOf("proxy"); + var newBaseURI; + if (ind > 0) { + appId = words[ind + 1]; + newBaseURI = words.slice(0, ind + 2).join('/'); + return newBaseURI + "/api/v1/applications/" + appId + "/executorPeakMemoryMetricsDistribution"; + } + ind = words.indexOf("history"); + if (ind > 0) { + appId = words[ind + 1]; + var attemptId = words[ind + 2]; + newBaseURI = words.slice(0, ind).join('/'); + if (isNaN(attemptId)) { + return newBaseURI + "/api/v1/applications/" + appId + "/executorPeakMemoryMetricsDistribution"; + } else { + return newBaseURI + "/api/v1/applications/" + appId + "/" + attemptId + "/executorPeakMemoryMetricsDistribution"; + } + } + return uiRoot + "/api/v1/applications/" + appId + "/executorPeakMemoryMetricsDistribution"; +} + function createRESTEndPointForMiscellaneousProcess(appId) { var words = getBaseURI().split('/'); var ind = words.indexOf("proxy"); diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 398cd45a6e87..7e9a1236b892 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -452,6 +452,18 @@ private[spark] class AppStatusStore( Some(computedQuantiles) } + /** + * Calculates a summary of the executor metrics for executors, returning the + * requested quantiles for the recorded metrics. + */ + def executorMetricSummary( + activeOnly: Boolean, + unsortedQuantiles: Array[Double]): Option[v1.ExecutorPeakMetricsDistributions] = { + val quantiles = unsortedQuantiles.sorted + val executors = executorList(activeOnly).flatMap(_.peakMemoryMetrics).toIndexedSeq + Some(new v1.ExecutorPeakMetricsDistributions(quantiles, executors)) + } + /** * Whether to cache information about a specific metric quantile. We cache quantiles at every 0.05 * step, which covers the default values used both in the API and in the stages page. 
diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala index ef17168ebce6..1c3c3eb93168 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala @@ -52,6 +52,25 @@ private[v1] class AbstractApplicationResource extends BaseAppResource { @Path("executors") def executorList(): Seq[ExecutorSummary] = withUI(_.store.executorList(true)) + @GET + @Path("executorPeakMemoryMetricsDistribution") + def executorSummary( + @QueryParam("activeOnly") @DefaultValue("true") activeOnly: Boolean, + @DefaultValue("0.05,0.25,0.5,0.75,0.95") @QueryParam("quantiles") quantileString: String) + : ExecutorPeakMetricsDistributions = withUI { ui => + val quantiles = quantileString.split(",").map { s => + try { + s.toDouble + } catch { + case _: NumberFormatException => + throw new BadParameterException("quantiles", "double", s) + } + } + + ui.store.executorMetricSummary(activeOnly, quantiles).getOrElse( + throw new NotFoundException(s"No executor reported metrics yet.")) + } + @GET @Path("executors/{executorId}/threads") def threadDump(@PathParam("executorId") execId: String): Array[ThreadStackTrace] = withUI { ui => diff --git a/core/src/test/resources/HistoryServerExpectations/executor_peak_memory_metrics_distributions_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_peak_memory_metrics_distributions_expectation.json new file mode 100644 index 000000000000..a4878b9d5e5d --- /dev/null +++ b/core/src/test/resources/HistoryServerExpectations/executor_peak_memory_metrics_distributions_expectation.json @@ -0,0 +1,23 @@ +{ + "JVMHeapMemory" : [ 2.09883992E8, 4.6213568E8, 7.5947948E8, 9.8473656E8, 9.8473656E8 ], + "JVMOffHeapMemory" : [ 6.0829472E7, 6.1343616E7, 6.271752E7, 9.1926448E7, 9.1926448E7 ], + "OnHeapExecutionMemory" : [ 
0.0, 0.0, 0.0, 0.0, 0.0 ], + "OffHeapExecutionMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "OnHeapStorageMemory" : [ 7023.0, 12537.0, 19560.0, 19560.0, 19560.0 ], + "OffHeapStorageMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "OnHeapUnifiedMemory" : [ 7023.0, 12537.0, 19560.0, 19560.0, 19560.0 ], + "OffHeapUnifiedMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "DirectPoolMemory" : [ 10742.0, 10865.0, 12781.0, 157182.0, 157182.0 ], + "MappedPoolMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "ProcessTreeJVMVMemory" : [ 8.296026112E9, 9.678606336E9, 9.684373504E9, 9.691553792E9, 9.691553792E9 ], + "ProcessTreeJVMRSSMemory" : [ 5.26491648E8, 7.03639552E8, 9.64222976E8, 1.210867712E9, 1.210867712E9 ], + "ProcessTreePythonVMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "ProcessTreePythonRSSMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "ProcessTreeOtherVMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "ProcessTreeOtherRSSMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "MinorGCCount" : [ 7.0, 15.0, 24.0, 27.0, 27.0 ], + "MinorGCTime" : [ 55.0, 106.0, 140.0, 145.0, 145.0 ], + "MajorGCCount" : [ 2.0, 2.0, 2.0, 3.0, 3.0 ], + "MajorGCTime" : [ 60.0, 63.0, 75.0, 144.0, 144.0 ], + "TotalGCTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ] +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index c3bd4d880a5a..8beb93ddc865 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -184,6 +184,8 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers "executor node excludeOnFailure unexcluding" -> "applications/app-20161115172038-0000/executors", "executor memory usage" -> "applications/app-20161116163331-0000/executors", + "executor peak memory metrics distributions" -> + "applications/application_1553914137147_0018/executorPeakMemoryMetricsDistribution", "executor resource 
information" -> "applications/application_1555004656427_0144/executors", "multiple resource profiles" -> "applications/application_1578436911597_0052/environment", "stage list with peak metrics" -> "applications/app-20200706201101-0003/stages", diff --git a/docs/monitoring.md b/docs/monitoring.md index e54ac5414ba7..ceb861b34511 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -545,6 +545,15 @@ can be identified by their `[attempt-id]`. In the API listed below, when running + + + +
/applications/[app-id]/allexecutors A list of all (active and dead) executors for the given application.
/applications/[app-id]/executorPeakMemoryMetricsDistribution + Distributions of peak memory metrics for executors. +
?activeOnly=[true (default) | false] whether to include only active executors in the distributions +
?quantiles=[comma-separated list of numbers] summarizes the metrics at the given quantiles (default: 0.05,0.25,0.5,0.75,0.95). +
Example: ?activeOnly=false&quantiles=0.01,0.5,0.99 +
/applications/[app-id]/storage/rdd A list of stored RDDs for the given application.