From 6a74ca2dbca11ac330901d2e6a65ba8615509041 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Fri, 1 Nov 2024 05:02:01 -0700 Subject: [PATCH 1/3] enable spark metrics and eventlogs --- .../examples/benchmark/tpcds-benchmark-1t-ebs.yaml | 13 +++++++++++++ .../examples/benchmark/tpcds-benchmark-1t-ssd.yaml | 13 +++++++++++++ .../tpcds-benchmark-data-generation-1t.yaml | 13 +++++++++++++ 3 files changed, 39 insertions(+) diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ebs.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ebs.yaml index b0fb79379..69ef9e04c 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ebs.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ebs.yaml @@ -47,6 +47,19 @@ spec: # Logging set to WARN - "true" sparkConf: + # Expose Spark metrics for Prometheus + "spark.ui.prometheus.enabled": "true" + "spark.executor.processTreeMetrics.enabled": "true" + "spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet" + "spark.metrics.conf.driver.sink.prometheusServlet.path": "/metrics/driver/prometheus/" + "spark.metrics.conf.executor.sink.prometheusServlet.path": "/metrics/executors/prometheus/" + + # Spark Event logs + "spark.eventLog.enabled": "true" + "spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs" + "spark.eventLog.rolling.enabled": "true" + "spark.eventLog.rolling.maxFileSize": "64m" + "spark.network.timeout": "2000s" "spark.executor.heartbeatInterval": "300s" # AQE diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ssd.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ssd.yaml index 7bc5a0566..008e1d165 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ssd.yaml +++ 
b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-1t-ssd.yaml @@ -47,6 +47,19 @@ spec: # Logging set to WARN - "true" sparkConf: + # Expose Spark metrics for Prometheus + "spark.ui.prometheus.enabled": "true" + "spark.executor.processTreeMetrics.enabled": "true" + "spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet" + "spark.metrics.conf.driver.sink.prometheusServlet.path": "/metrics/driver/prometheus/" + "spark.metrics.conf.executor.sink.prometheusServlet.path": "/metrics/executors/prometheus/" + + # Spark Event logs + "spark.eventLog.enabled": "true" + "spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs" + "spark.eventLog.rolling.enabled": "true" + "spark.eventLog.rolling.maxFileSize": "64m" + "spark.network.timeout": "2000s" "spark.executor.heartbeatInterval": "300s" # AQE diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml index 2d2ec32b5..d304611dc 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-1t.yaml @@ -38,6 +38,19 @@ spec: # Logging set to WARN - "true" sparkConf: + # Expose Spark metrics for Prometheus + "spark.ui.prometheus.enabled": "true" + "spark.executor.processTreeMetrics.enabled": "true" + "spark.metrics.conf.*.sink.prometheusServlet.class": "org.apache.spark.metrics.sink.PrometheusServlet" + "spark.metrics.conf.driver.sink.prometheusServlet.path": "/metrics/driver/prometheus/" + "spark.metrics.conf.executor.sink.prometheusServlet.path": "/metrics/executors/prometheus/" + + # Spark Event logs + "spark.eventLog.enabled": "true" + "spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs" + "spark.eventLog.rolling.enabled": "true" + "spark.eventLog.rolling.maxFileSize": "64m" + 
"spark.executorEnv.JAVA_HOME": "/opt/java/openjdk" "spark.driverEnv.JAVA_HOME": "/opt/java/openjdk" "spark.network.timeout": "2000s" From 4261a1a4eab01ccd9b635f6e79eca61b037aa128 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Fri, 1 Nov 2024 05:02:14 -0700 Subject: [PATCH 2/3] cleanup Dockerfile --- .../spark-k8s-operator/examples/docker/Dockerfile-benchmark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile-benchmark b/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile-benchmark index addcf3171..53ee14e39 100644 --- a/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile-benchmark +++ b/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile-benchmark @@ -1,5 +1,5 @@ # Use the official Spark base image with Java 17 and Python 3 -FROM apache/spark:3.5.3-scala2.12-java17-python3-ubuntu as tpc-toolkit +FROM apache/spark:3.5.3-scala2.12-java17-python3-ubuntu # Arguments for version control ARG HADOOP_VERSION=3.4.1 From 438ec788a036798f19ef4ecf13a58908a8003451 Mon Sep 17 00:00:00 2001 From: Alan Tyson Date: Fri, 1 Nov 2024 05:02:35 -0700 Subject: [PATCH 3/3] add PodMonitor to scrape spark metrics --- .../kube-prometheus-amp-enable.yaml | 19 +++++++++++++++++++ .../helm-values/kube-prometheus.yaml | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml index 9fc35ea55..112ea39ef 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml @@ -51,6 +51,25 @@ prometheus: - source_labels: [__meta_kubernetes_endpoint_port_name] regex: http-metrics action: keep + # Monitors for Spark Jobs + additionalPodMonitors: + - name: "spark-job-monitoring" + 
jobLabel: "spark-job-monitoring" + selector: + matchLabels: + spark-role: driver + namespaceSelector: + matchNames: + - spark-team-a + - spark-team-b + - spark-team-c + podMetricsEndpoints: + - port: "spark-ui" + interval: 30s + path: /metrics/driver/prometheus/ + - port: "spark-ui" + interval: 30s + path: /metrics/executors/prometheus/ alertmanager: enabled: false diff --git a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml index fdc406518..899292abc 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml @@ -38,6 +38,25 @@ prometheus: - source_labels: [__meta_kubernetes_endpoint_port_name] regex: http-metrics action: keep + # Monitors for Spark Jobs + additionalPodMonitors: + - name: "spark-job-monitoring" + jobLabel: "spark-job-monitoring" + selector: + matchLabels: + spark-role: driver + namespaceSelector: + matchNames: + - spark-team-a + - spark-team-b + - spark-team-c + podMetricsEndpoints: + - port: "spark-ui" + interval: 30s + path: /metrics/driver/prometheus/ + - port: "spark-ui" + interval: 30s + path: /metrics/executors/prometheus/ alertmanager: enabled: false