Skip to content

Commit

Permalink
feat: Spark benchmark observability (#693)
Browse files Browse the repository at this point in the history
  • Loading branch information
alanty authored Nov 1, 2024
1 parent d0b0106 commit 47405a0
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ spec:
# Logging set to WARN
- "true"
sparkConf:
# Expose Spark metrics for Prometheus
"spark.ui.prometheus.enabled": "true"
"spark.executor.processTreeMetrics.enabled": "true"
"spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet"
"spark.metrics.conf.driver.sink.prometheusServlet.path": "/metrics/driver/prometheus/"
"spark.metrics.conf.executor.sink.prometheusServlet.path": "/metrics/executors/prometheus/"

# Spark Event logs
"spark.eventLog.enabled": "true"
"spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs"
"spark.eventLog.rolling.enabled": "true"
"spark.eventLog.rolling.maxFileSize": "64m"

"spark.network.timeout": "2000s"
"spark.executor.heartbeatInterval": "300s"
# AQE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ spec:
# Logging set to WARN
- "true"
sparkConf:
# Expose Spark metrics for Prometheus
"spark.ui.prometheus.enabled": "true"
"spark.executor.processTreeMetrics.enabled": "true"
"spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet"
"spark.metrics.conf.driver.sink.prometheusServlet.path": "/metrics/driver/prometheus/"
"spark.metrics.conf.executor.sink.prometheusServlet.path": "/metrics/executors/prometheus/"

# Spark Event logs
"spark.eventLog.enabled": "true"
"spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs"
"spark.eventLog.rolling.enabled": "true"
"spark.eventLog.rolling.maxFileSize": "64m"

"spark.network.timeout": "2000s"
"spark.executor.heartbeatInterval": "300s"
# AQE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,19 @@ spec:
# Logging set to WARN
- "true"
sparkConf:
# Expose Spark metrics for Prometheus
"spark.ui.prometheus.enabled": "true"
"spark.executor.processTreeMetrics.enabled": "true"
"spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet"
"spark.metrics.conf.driver.sink.prometheusServlet.path": "/metrics/driver/prometheus/"
"spark.metrics.conf.executor.sink.prometheusServlet.path": "/metrics/executors/prometheus/"

# Spark Event logs
"spark.eventLog.enabled": "true"
"spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs"
"spark.eventLog.rolling.enabled": "true"
"spark.eventLog.rolling.maxFileSize": "64m"

"spark.executorEnv.JAVA_HOME": "/opt/java/openjdk"
"spark.driverEnv.JAVA_HOME": "/opt/java/openjdk"
"spark.network.timeout": "2000s"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Use the official Spark base image with Java 17 and Python 3
- FROM apache/spark:3.5.3-scala2.12-java17-python3-ubuntu as tpc-toolkit
+ FROM apache/spark:3.5.3-scala2.12-java17-python3-ubuntu

# Arguments for version control
ARG HADOOP_VERSION=3.4.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,25 @@ prometheus:
- source_labels: [__meta_kubernetes_endpoint_port_name]
regex: http-metrics
action: keep
# Monitors for Spark Jobs
# One pod monitor entry: it selects pods by label within the listed
# namespaces and scrapes every endpoint under podMetricsEndpoints.
# NOTE(review): the hunk header places this inside the `prometheus:` section;
# shift the whole block to the enclosing key's indentation when applying.
additionalPodMonitors:
  - name: "spark-job-monitoring"
    jobLabel: "spark-job-monitoring"
    # Only pods labelled spark-role=driver are selected.
    selector:
      matchLabels:
        spark-role: driver
    namespaceSelector:
      matchNames:
        - spark-team-a
        - spark-team-b
        - spark-team-c
    # Both paths are scraped from the same named port, every 30s.
    podMetricsEndpoints:
      - port: "spark-ui"
        interval: 30s
        path: /metrics/driver/prometheus/
      - port: "spark-ui"
        interval: 30s
        path: /metrics/executors/prometheus/

alertmanager:
enabled: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,25 @@ prometheus:
- source_labels: [__meta_kubernetes_endpoint_port_name]
regex: http-metrics
action: keep
# Monitors for Spark Jobs
# One pod monitor entry: it selects pods by label within the listed
# namespaces and scrapes every endpoint under podMetricsEndpoints.
# NOTE(review): the hunk header places this inside the `prometheus:` section;
# shift the whole block to the enclosing key's indentation when applying.
additionalPodMonitors:
  - name: "spark-job-monitoring"
    jobLabel: "spark-job-monitoring"
    # Only pods labelled spark-role=driver are selected.
    selector:
      matchLabels:
        spark-role: driver
    namespaceSelector:
      matchNames:
        - spark-team-a
        - spark-team-b
        - spark-team-c
    # Both paths are scraped from the same named port, every 30s.
    podMetricsEndpoints:
      - port: "spark-ui"
        interval: 30s
        path: /metrics/driver/prometheus/
      - port: "spark-ui"
        interval: 30s
        path: /metrics/executors/prometheus/

alertmanager:
enabled: false
Expand Down

0 comments on commit 47405a0

Please sign in to comment.