From a8cd7fac5dba0ed546ae60f43cac88c7f62810d0 Mon Sep 17 00:00:00 2001 From: alanty <45295374+alanty@users.noreply.github.com> Date: Fri, 31 Jan 2025 11:54:09 -0800 Subject: [PATCH] feat: Add Karpenter and EMR Spark Dashboards to spark-operator (#738) --- .../terraform/spark-k8s-operator/README.md | 8 +++--- .../kube-prometheus-amp-enable.yaml | 26 +++++++++++++++++-- .../helm-values/kube-prometheus.yaml | 26 +++++++++++++++++-- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/README.md b/analytics/terraform/spark-k8s-operator/README.md index 0117392dd..acb2f6614 100644 --- a/analytics/terraform/spark-k8s-operator/README.md +++ b/analytics/terraform/spark-k8s-operator/README.md @@ -79,17 +79,17 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.31"` | no | -| [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
[| no | +| [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
"100.64.0.0/17",
"100.64.128.0/17"
]
[| no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | | [enable\_jupyterhub](#input\_enable\_jupyterhub) | Enable Jupyter Hub | `bool` | `false` | no | | [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `false` | no | | [kms\_key\_admin\_roles](#input\_kms\_key\_admin\_roles) | list of role ARNs to add to the KMS policy | `list(string)` | `[]` | no | | [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"spark-operator-doeks"` | no | -| [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` |
"100.64.0.0/17",
"100.64.128.0/17"
]
[| no | -| [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` |
"10.1.1.0/24",
"10.1.2.0/24"
]
[| no | +| [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` |
"10.1.0.0/26",
"10.1.0.64/26"
]
[| no | +| [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` |
"10.1.1.0/24",
"10.1.2.0/24"
]
[| no | | [region](#input\_region) | Region | `string` | `"us-west-2"` | no | -| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
"10.1.0.0/26",
"10.1.0.64/26"
]
[| no | +| [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` |
"100.64.0.0/16"
]
[| no | | [spark\_benchmark\_ssd\_desired\_size](#input\_spark\_benchmark\_ssd\_desired\_size) | Desired size for nodegroup of c5d 12xlarge instances to run data generation for Spark benchmark | `number` | `0` | no | | [spark\_benchmark\_ssd\_min\_size](#input\_spark\_benchmark\_ssd\_min\_size) | Minimum size for nodegroup of c5d 12xlarge instances to run data generation for Spark benchmark | `number` | `0` | no | | [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/16"` | no | diff --git a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml index 23d34ced5..3ab96f5fe 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus-amp-enable.yaml @@ -53,9 +53,11 @@ prometheus: names: - karpenter relabel_configs: - - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: http-metrics + - source_labels: + - __meta_kubernetes_endpoints_name + - __meta_kubernetes_endpoint_port_name action: keep + regex: karpenter;http-metrics # Monitors for Spark Jobs additionalPodMonitors: - name: "spark-job-monitoring" @@ -98,3 +100,23 @@ grafana: type: prometheus isDefault: false url: ${amp_url} + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + dashboards: + default: + karpenter-capacity-dashboard: + url: https://karpenter.sh/v1.2/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json + karpenter-performance-dashboard: + url: https://karpenter.sh/v1.2/getting-started/getting-started-with-karpenter/karpenter-performance-dashboard.json + spark-job-dashboard: + url: https://raw.githubusercontent.com/awslabs/data-on-eks/refs/heads/main/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/emr-eks-grafana-dashboard.json diff --git a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml index 661d4a193..c4e37660e 100644 --- a/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml +++ b/analytics/terraform/spark-k8s-operator/helm-values/kube-prometheus.yaml @@ -40,9 +40,11 @@ prometheus: names: - karpenter relabel_configs: - - source_labels: [__meta_kubernetes_endpoint_port_name] - regex: http-metrics + - source_labels: + - __meta_kubernetes_endpoints_name + - __meta_kubernetes_endpoint_port_name action: keep + regex: karpenter;http-metrics # Monitors for Spark Jobs additionalPodMonitors: - name: "spark-job-monitoring" @@ -69,3 +71,23 @@ alertmanager: grafana: enabled: true defaultDashboardsEnabled: true + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + dashboards: + default: + karpenter-capacity-dashboard: + url: https://karpenter.sh/v1.2/getting-started/getting-started-with-karpenter/karpenter-capacity-dashboard.json + karpenter-performance-dashboard: + url: https://karpenter.sh/v1.2/getting-started/getting-started-with-karpenter/karpenter-performance-dashboard.json + spark-job-dashboard: + url: https://raw.githubusercontent.com/awslabs/data-on-eks/refs/heads/main/analytics/terraform/emr-eks-karpenter/emr-grafana-dashboard/emr-eks-grafana-dashboard.json
"100.64.0.0/16"
]