Skip to content

Commit bbfaadb

Browse files
dongjoon-hyundbtsai
authored andcommitted
[SPARK-29064][CORE] Add PrometheusResource to export Executor metrics
### What changes were proposed in this pull request? At Apache Spark 3.0.0, [SPARK-23429](apache#21221) added the ability to collect executor metrics via heartbeats and to expose it as a REST API. This PR aims to extend it to support `Prometheus` format additionally. ### Why are the changes needed? Prometheus.io is a CNCF project used widely with K8s. - https://github.com/prometheus/prometheus ### Does this PR introduce any user-facing change? Yes. New web interfaces are added along with the existing JSON API. | | JSON End Point | Prometheus End Point | | ------- | ------------------------------------ | --------------------------------- | | Driver | /api/v1/applications/{id}/executors/ | /metrics/executors/prometheus/ | ### How was this patch tested? Manually connect to the new end-points with `curl` and compare with JSON. **SETUP** ``` $ sbin/start-master.sh $ sbin/start-slave.sh spark://`hostname`:7077 $ bin/spark-shell --master spark://`hostname`:7077 --conf spark.ui.prometheus.enabled=true ``` **JSON (existing after SPARK-23429)** ``` $ curl -s http://localhost:4040/api/v1/applications/app-20190911204823-0000/executors [ { "id" : "driver", "hostPort" : "localhost:52615", "isActive" : true, "rddBlocks" : 0, "memoryUsed" : 0, "diskUsed" : 0, "totalCores" : 0, "maxTasks" : 0, "activeTasks" : 0, "failedTasks" : 0, "completedTasks" : 0, "totalTasks" : 0, "totalDuration" : 0, "totalGCTime" : 0, "totalInputBytes" : 0, "totalShuffleRead" : 0, "totalShuffleWrite" : 0, "isBlacklisted" : false, "maxMemory" : 384093388, "addTime" : "2019-09-12T03:48:23.875GMT", "executorLogs" : { }, "memoryMetrics" : { "usedOnHeapStorageMemory" : 0, "usedOffHeapStorageMemory" : 0, "totalOnHeapStorageMemory" : 384093388, "totalOffHeapStorageMemory" : 0 }, "blacklistedInStages" : [ ], "peakMemoryMetrics" : { "JVMHeapMemory" : 229995952, "JVMOffHeapMemory" : 145872280, "OnHeapExecutionMemory" : 0, "OffHeapExecutionMemory" : 0, "OnHeapStorageMemory" : 0, "OffHeapStorageMemory" : 0, "OnHeapUnifiedMemory" : 0, "OffHeapUnifiedMemory" : 0, "DirectPoolMemory" : 75891, "MappedPoolMemory" : 0, "ProcessTreeJVMVMemory" : 0, "ProcessTreeJVMRSSMemory" : 0, "ProcessTreePythonVMemory" : 0, "ProcessTreePythonRSSMemory" : 0, "ProcessTreeOtherVMemory" : 0, "ProcessTreeOtherRSSMemory" : 0, "MinorGCCount" : 8, "MinorGCTime" : 82, "MajorGCCount" : 3, "MajorGCTime" : 128 }, "attributes" : { }, "resources" : { } }, { "id" : "0", "hostPort" : "127.0.0.1:52619", "isActive" : true, "rddBlocks" : 0, "memoryUsed" : 0, "diskUsed" : 0, "totalCores" : 16, "maxTasks" : 16, "activeTasks" : 0, "failedTasks" : 0, "completedTasks" : 0, "totalTasks" : 0, "totalDuration" : 0, "totalGCTime" : 0, "totalInputBytes" : 0, "totalShuffleRead" : 0, "totalShuffleWrite" : 0, "isBlacklisted" : false, "maxMemory" : 384093388, "addTime" : "2019-09-12T03:48:25.907GMT", "executorLogs" : { "stdout" : "http://127.0.0.1:8081/logPage/?appId=app-20190911204823-0000&executorId=0&logType=stdout", "stderr" : "http://127.0.0.1:8081/logPage/?appId=app-20190911204823-0000&executorId=0&logType=stderr" }, "memoryMetrics" : { "usedOnHeapStorageMemory" : 0, "usedOffHeapStorageMemory" : 0, "totalOnHeapStorageMemory" : 384093388, "totalOffHeapStorageMemory" : 0 }, "blacklistedInStages" : [ ], "attributes" : { }, "resources" : { } } ] ``` **Prometheus** ``` $ curl -s http://localhost:4040/metrics/executors/prometheus metrics_app_20190911204823_0000_driver_executor_rddBlocks_Count 0 metrics_app_20190911204823_0000_driver_executor_memoryUsed_Count 0 metrics_app_20190911204823_0000_driver_executor_diskUsed_Count 0 metrics_app_20190911204823_0000_driver_executor_totalCores_Count 0 metrics_app_20190911204823_0000_driver_executor_maxTasks_Count 0 metrics_app_20190911204823_0000_driver_executor_activeTasks_Count 0 metrics_app_20190911204823_0000_driver_executor_failedTasks_Count 0 metrics_app_20190911204823_0000_driver_executor_completedTasks_Count 0 metrics_app_20190911204823_0000_driver_executor_totalTasks_Count 0 metrics_app_20190911204823_0000_driver_executor_totalDuration_Value 0 metrics_app_20190911204823_0000_driver_executor_totalGCTime_Value 0 metrics_app_20190911204823_0000_driver_executor_totalInputBytes_Count 0 metrics_app_20190911204823_0000_driver_executor_totalShuffleRead_Count 0 metrics_app_20190911204823_0000_driver_executor_totalShuffleWrite_Count 0 metrics_app_20190911204823_0000_driver_executor_maxMemory_Count 384093388 metrics_app_20190911204823_0000_driver_executor_usedOnHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_usedOffHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_totalOnHeapStorageMemory_Count 384093388 metrics_app_20190911204823_0000_driver_executor_totalOffHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_JVMHeapMemory_Count 230406336 metrics_app_20190911204823_0000_driver_executor_JVMOffHeapMemory_Count 146132592 metrics_app_20190911204823_0000_driver_executor_OnHeapExecutionMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_OffHeapExecutionMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_OnHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_OffHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_OnHeapUnifiedMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_OffHeapUnifiedMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_DirectPoolMemory_Count 97049 metrics_app_20190911204823_0000_driver_executor_MappedPoolMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_ProcessTreeJVMVMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_ProcessTreeJVMRSSMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_ProcessTreePythonVMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_ProcessTreePythonRSSMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_ProcessTreeOtherVMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_ProcessTreeOtherRSSMemory_Count 0 metrics_app_20190911204823_0000_driver_executor_MinorGCCount_Count 8 metrics_app_20190911204823_0000_driver_executor_MinorGCTime_Count 82 metrics_app_20190911204823_0000_driver_executor_MajorGCCount_Count 3 metrics_app_20190911204823_0000_driver_executor_MajorGCTime_Count 128 metrics_app_20190911204823_0000_0_executor_rddBlocks_Count 0 metrics_app_20190911204823_0000_0_executor_memoryUsed_Count 0 metrics_app_20190911204823_0000_0_executor_diskUsed_Count 0 metrics_app_20190911204823_0000_0_executor_totalCores_Count 16 metrics_app_20190911204823_0000_0_executor_maxTasks_Count 16 metrics_app_20190911204823_0000_0_executor_activeTasks_Count 0 metrics_app_20190911204823_0000_0_executor_failedTasks_Count 0 metrics_app_20190911204823_0000_0_executor_completedTasks_Count 0 metrics_app_20190911204823_0000_0_executor_totalTasks_Count 0 metrics_app_20190911204823_0000_0_executor_totalDuration_Value 0 metrics_app_20190911204823_0000_0_executor_totalGCTime_Value 0 metrics_app_20190911204823_0000_0_executor_totalInputBytes_Count 0 metrics_app_20190911204823_0000_0_executor_totalShuffleRead_Count 0 metrics_app_20190911204823_0000_0_executor_totalShuffleWrite_Count 0 metrics_app_20190911204823_0000_0_executor_maxMemory_Count 384093388 metrics_app_20190911204823_0000_0_executor_usedOnHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_0_executor_usedOffHeapStorageMemory_Count 0 metrics_app_20190911204823_0000_0_executor_totalOnHeapStorageMemory_Count 384093388 metrics_app_20190911204823_0000_0_executor_totalOffHeapStorageMemory_Count 0 ``` Closes apache#25770 from dongjoon-hyun/SPARK-29064. Authored-by: Dongjoon Hyun <[email protected]> Signed-off-by: DB Tsai <[email protected]>
1 parent c610de6 commit bbfaadb

File tree

3 files changed

+120
-0
lines changed

3 files changed

+120
-0
lines changed

core/src/main/scala/org/apache/spark/internal/config/UI.scala

+7
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ private[spark] object UI {
8181
.booleanConf
8282
.createWithDefault(true)
8383

84+
val UI_PROMETHEUS_ENABLED = ConfigBuilder("spark.ui.prometheus.enabled")
85+
.internal()
86+
.doc("Expose executor metrics at /metrics/executors/prometheus. " +
87+
"For master/worker/driver metrics, you need to configure `conf/metrics.properties`.")
88+
.booleanConf
89+
.createWithDefault(false)
90+
8491
val UI_X_XSS_PROTECTION = ConfigBuilder("spark.ui.xXssProtection")
8592
.doc("Value for HTTP X-XSS-Protection response header")
8693
.stringConf
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.status.api.v1
18+
19+
import javax.ws.rs._
20+
import javax.ws.rs.core.MediaType
21+
22+
import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder}
23+
import org.glassfish.jersey.server.ServerProperties
24+
import org.glassfish.jersey.servlet.ServletContainer
25+
26+
import org.apache.spark.ui.SparkUI
27+
28+
/**
29+
* This aims to expose Executor metrics like REST API which is documented in
30+
*
31+
* https://spark.apache.org/docs/3.0.0/monitoring.html#executor-metrics
32+
*
33+
* Note that this is based on ExecutorSummary which is different from ExecutorSource.
34+
*/
35+
@Path("/executors")
36+
private[v1] class PrometheusResource extends ApiRequestContext {
37+
@GET
38+
@Path("prometheus")
39+
@Produces(Array(MediaType.TEXT_PLAIN))
40+
def executors(): String = {
41+
val sb = new StringBuilder
42+
val store = uiRoot.asInstanceOf[SparkUI].store
43+
val appId = store.applicationInfo.id.replaceAll("[^a-zA-Z0-9]", "_")
44+
store.executorList(true).foreach { executor =>
45+
val prefix = s"metrics_${appId}_${executor.id}_executor_"
46+
sb.append(s"${prefix}rddBlocks_Count ${executor.rddBlocks}\n")
47+
sb.append(s"${prefix}memoryUsed_Count ${executor.memoryUsed}\n")
48+
sb.append(s"${prefix}diskUsed_Count ${executor.diskUsed}\n")
49+
sb.append(s"${prefix}totalCores_Count ${executor.totalCores}\n")
50+
sb.append(s"${prefix}maxTasks_Count ${executor.maxTasks}\n")
51+
sb.append(s"${prefix}activeTasks_Count ${executor.activeTasks}\n")
52+
sb.append(s"${prefix}failedTasks_Count ${executor.failedTasks}\n")
53+
sb.append(s"${prefix}completedTasks_Count ${executor.completedTasks}\n")
54+
sb.append(s"${prefix}totalTasks_Count ${executor.totalTasks}\n")
55+
sb.append(s"${prefix}totalDuration_Value ${executor.totalDuration}\n")
56+
sb.append(s"${prefix}totalGCTime_Value ${executor.totalGCTime}\n")
57+
sb.append(s"${prefix}totalInputBytes_Count ${executor.totalInputBytes}\n")
58+
sb.append(s"${prefix}totalShuffleRead_Count ${executor.totalShuffleRead}\n")
59+
sb.append(s"${prefix}totalShuffleWrite_Count ${executor.totalShuffleWrite}\n")
60+
sb.append(s"${prefix}maxMemory_Count ${executor.maxMemory}\n")
61+
executor.executorLogs.foreach { case (k, v) => }
62+
executor.memoryMetrics.foreach { m =>
63+
sb.append(s"${prefix}usedOnHeapStorageMemory_Count ${m.usedOnHeapStorageMemory}\n")
64+
sb.append(s"${prefix}usedOffHeapStorageMemory_Count ${m.usedOffHeapStorageMemory}\n")
65+
sb.append(s"${prefix}totalOnHeapStorageMemory_Count ${m.totalOnHeapStorageMemory}\n")
66+
sb.append(s"${prefix}totalOffHeapStorageMemory_Count ${m.totalOffHeapStorageMemory}\n")
67+
}
68+
executor.peakMemoryMetrics.foreach { m =>
69+
val names = Array(
70+
"JVMHeapMemory",
71+
"JVMOffHeapMemory",
72+
"OnHeapExecutionMemory",
73+
"OffHeapExecutionMemory",
74+
"OnHeapStorageMemory",
75+
"OffHeapStorageMemory",
76+
"OnHeapUnifiedMemory",
77+
"OffHeapUnifiedMemory",
78+
"DirectPoolMemory",
79+
"MappedPoolMemory",
80+
"ProcessTreeJVMVMemory",
81+
"ProcessTreeJVMRSSMemory",
82+
"ProcessTreePythonVMemory",
83+
"ProcessTreePythonRSSMemory",
84+
"ProcessTreeOtherVMemory",
85+
"ProcessTreeOtherRSSMemory",
86+
"MinorGCCount",
87+
"MinorGCTime",
88+
"MajorGCCount",
89+
"MajorGCTime"
90+
)
91+
names.foreach { name =>
92+
sb.append(s"$prefix${name}_Count ${m.getMetricValue(name)}\n")
93+
}
94+
}
95+
}
96+
sb.toString
97+
}
98+
}
99+
100+
private[spark] object PrometheusResource {
101+
def getServletHandler(uiRoot: UIRoot): ServletContextHandler = {
102+
val jerseyContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS)
103+
jerseyContext.setContextPath("/metrics")
104+
val holder: ServletHolder = new ServletHolder(classOf[ServletContainer])
105+
holder.setInitParameter(ServerProperties.PROVIDER_PACKAGES, "org.apache.spark.status.api.v1")
106+
UIRootFromServletContext.setUiRoot(jerseyContext, uiRoot)
107+
jerseyContext.addServlet(holder, "/*")
108+
jerseyContext
109+
}
110+
}

core/src/main/scala/org/apache/spark/ui/SparkUI.scala

+3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ private[spark] class SparkUI private (
6666
addStaticHandler(SparkUI.STATIC_RESOURCE_DIR)
6767
attachHandler(createRedirectHandler("/", "/jobs/", basePath = basePath))
6868
attachHandler(ApiRootResource.getServletHandler(this))
69+
if (sc.map(_.conf.get(UI_PROMETHEUS_ENABLED)).getOrElse(false)) {
70+
attachHandler(PrometheusResource.getServletHandler(this))
71+
}
6972

7073
// These should be POST only, but, the YARN AM proxy won't proxy POSTs
7174
attachHandler(createRedirectHandler(

0 commit comments

Comments
 (0)