feat: Add spark benchmark test data generation changes (#694)
ratnopamc authored Nov 4, 2024
1 parent ad50d57 commit c635266
Showing 6 changed files with 151 additions and 47 deletions.
4 changes: 2 additions & 2 deletions analytics/terraform/spark-k8s-operator/eks.tf
@@ -175,9 +175,9 @@ module "eks" {
# Node group will be created with zero instances when you deploy the blueprint.
# You can change the min_size and desired_size to 6 instances
# desired_size might not be applied through terraform once the node group is created so this needs to be adjusted in AWS Console.
-min_size = 0 # Change min and desired to 6 for running benchmarks
+min_size = var.spark_benchmark_ssd_min_size # Change min and desired to 6 for running benchmarks
max_size = 8
-desired_size = 0 # Change min and desired to 6 for running benchmarks
+desired_size = var.spark_benchmark_ssd_desired_size # Change min and desired to 6 for running benchmarks

instance_types = ["c5d.12xlarge"] # c5d.12xlarge = 2 x 900 NVMe SSD
@@ -1,8 +1,7 @@
# NOTE: This example requires the following prerequisites before executing the jobs
# 1. Ensure spark-team-a namespace exists
-# 2. replace <S3_BUCKET> with your bucket name
+# 2. replace `<S3_BUCKET>` with your bucket name

---
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
@@ -21,22 +20,22 @@ spec:
mainClass: com.amazonaws.eks.tpcds.DataGeneration
mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
arguments:
-# TPC-DS data location
-- "s3a://<S3_BUCKET>/TPCDS-TEST-1TB"
-# Path to kit in the docker image
-- "/opt/tpcds-kit/tools"
-# Data Format
-- "parquet"
-# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
-- "1000"
-# Generate data num partitions
-- "200"
-# Create the partitioned fact tables
-- "true"
-# Shuffle to get partitions coalesced into single files.
-- "true"
-# Logging set to WARN
-- "true"
+# TPC-DS data location
+- "s3a://<S3_BUCKET>/TPCDS-TEST-1TB"
+# Path to kit in the docker image
+- "/opt/tpcds-kit/tools"
+# Data Format
+- "parquet"
+# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
+- "1000"
+# Generate data num partitions
+- "200"
+# Create the partitioned fact tables
+- "true"
+# Shuffle to get partitions coalesced into single files.
+- "true"
+# Logging set to WARN
+- "true"
sparkConf:
# Expose Spark metrics for Prometheus
"spark.ui.prometheus.enabled": "true"
@@ -82,7 +81,7 @@ spec:
"spark.hadoop.fs.s3a.connection.maximum": "200"
"spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"
"spark.kubernetes.executor.podNamePrefix": "oss-data-gen"
-"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
+"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
# "spark.hadoop.fs.s3a.committer.staging.conflict-mode": "append"
# Data writing and shuffle tuning
"spark.shuffle.file.buffer": "1m"
@@ -111,47 +110,47 @@ spec:
securityContext:
runAsUser: 185
volumeMounts:
-- name: spark-local-dir-1
-mountPath: /data1
+- name: spark-local-dir-1
+mountPath: /data1
env:
-- name: JAVA_HOME
-value: "/opt/java/openjdk"
+- name: JAVA_HOME
+value: "/opt/java/openjdk"
initContainers:
-- name: volume-permission
-image: public.ecr.aws/docker/library/busybox
-command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
-volumeMounts:
-- name: spark-local-dir-1
-mountPath: /data1
+- name: volume-permission
+image: public.ecr.aws/docker/library/busybox
+command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
+volumeMounts:
+- name: spark-local-dir-1
+mountPath: /data1
nodeSelector:
-NodeGroupType: SparkComputeOptimized
+NodeGroupType: spark_benchmark_ssd
executor:
cores: 11
# The maximum memory size of the container to the running executor is determined by the sum of
# spark.executor.memoryOverhead, spark.executor.memory, spark.memory.offHeap.size, spark.executor.pyspark.memory
memory: "15g"
memoryOverhead: "4g"
-instances: 26
+instances: 22
serviceAccount: spark-team-a
securityContext:
runAsUser: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
-initContainers:
-- name: volume-permission
-image: public.ecr.aws/docker/library/busybox
-command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
-volumeMounts:
-- name: spark-local-dir-1
-mountPath: /data1
+initContainers:
+- name: volume-permission
+image: public.ecr.aws/docker/library/busybox
+command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
+volumeMounts:
+- name: spark-local-dir-1
+mountPath: /data1
env:
-- name: JAVA_HOME
-value: "/opt/java/openjdk"
+- name: JAVA_HOME
+value: "/opt/java/openjdk"
nodeSelector:
-NodeGroupType: SparkComputeOptimized
+NodeGroupType: spark_benchmark_ssd
volumes:
-- name: spark-local-dir-1
-hostPath:
-path: "/mnt/k8s-disks/0"
-type: DirectoryOrCreate
+- name: spark-local-dir-1
+hostPath:
+path: "/mnt/k8s-disks/0"
+type: DirectoryOrCreate
12 changes: 12 additions & 0 deletions analytics/terraform/spark-k8s-operator/variables.tf
@@ -77,3 +77,15 @@ variable "kms_key_admin_roles" {
type = list(string)
default = []
}
+
+variable "spark_benchmark_ssd_min_size" {
+  description = "Minimum size for node group of c5d.12xlarge instances to run data generation for Spark benchmark"
+  type        = number
+  default     = 0
+}
+
+variable "spark_benchmark_ssd_desired_size" {
+  description = "Desired size for node group of c5d.12xlarge instances to run data generation for Spark benchmark"
+  type        = number
+  default     = 0
+}
@@ -0,0 +1,7 @@
{
"label": "Spark Benchmarks",
"position": 2,
"link": {
"type": "generated-index"
}
}
@@ -0,0 +1,70 @@
---
sidebar_position: 2
sidebar_label: Data Generation
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CollapsibleContent from '../../../src/components/CollapsibleContent';

# Data Generation for Running Spark Benchmark Tests on Amazon EKS

The following guide provides instructions on how to generate the dataset for running the TPC-DS benchmark tests for Spark.

<CollapsibleContent header={<h2><span>Deploying the Solution</span></h2>}>

In this [example](https://github.com/awslabs/data-on-eks/tree/main/analytics/terraform/spark-k8s-operator), you will provision the following resources required to run Spark jobs with the open-source Spark Operator and Apache YuniKorn.

This example deploys an EKS cluster running the Spark K8s Operator into a new VPC.

- Creates a new sample VPC, 2 private subnets and 2 public subnets
- Creates an Internet Gateway for the public subnets and a NAT Gateway for the private subnets
- Creates an EKS cluster control plane with a public endpoint (for demo purposes only), a core managed node group, an on-demand node group and a Spot node group for Spark workloads
- Deploys Metrics Server, Cluster Autoscaler, Spark K8s Operator, Apache YuniKorn, Karpenter, Grafana, AMP and Prometheus server

### Prerequisites

Ensure that you have installed the following tools on your machine.

1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
2. [kubectl](https://Kubernetes.io/docs/tasks/tools/)
3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli)
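
A quick way to confirm the tools are available on your PATH is to run their standard version commands (a minimal check; any recent versions should work):

```bash
aws --version
kubectl version --client
terraform version
```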

### Deploy

Clone the repository.

```bash
git clone https://github.com/awslabs/data-on-eks.git
cd data-on-eks
export DOEKS_HOME=$(pwd)
```

If DOEKS_HOME is ever unset, you can always set it manually by running `export DOEKS_HOME=$(pwd)` from your data-on-eks directory.

Export the following environment variables to set the minimum and desired number of SSD-enabled c5d.12xlarge instances. In our tests, we set both of these to `6`. Adjust the number of instances to suit your requirements and setup.

```bash
export TF_VAR_spark_benchmark_ssd_min_size=6
export TF_VAR_spark_benchmark_ssd_desired_size=6
```

Navigate into one of the example directories and run the `install.sh` script.

```bash
cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator
chmod +x install.sh
./install.sh
```
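
Once the install completes, you can optionally confirm that the SSD-backed benchmark node group has nodes registered. This check assumes the node group labels its nodes with `NodeGroupType: spark_benchmark_ssd`, the same label the benchmark manifests use as a node selector:

```bash
# List nodes that carry the benchmark node group label
kubectl get nodes -l NodeGroupType=spark_benchmark_ssd -o wide
```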

Now create an S3_BUCKET variable that holds the name of the bucket created
during the install. This bucket will be used in later examples to store output
data. If S3_BUCKET is ever unset, you can run the following commands again.

```bash
export S3_BUCKET=$(terraform output -raw s3_bucket_id_spark_history_server)
echo $S3_BUCKET
```

</CollapsibleContent>

@@ -0,0 +1,16 @@
---
sidebar_position: 1
sidebar_label: Introduction to Spark Benchmarks
---

# Introduction to Spark Benchmarks on Amazon EKS 🚀

This guide walks you through running Apache Spark benchmark tests on Amazon EKS, AWS's managed Kubernetes service. Benchmark tests help you evaluate and optimize Spark workloads on EKS by comparing results across different generations of Graviton-based EC2 instance families, especially when scaling for performance, cost efficiency, and reliability.

## Key Features 📈

- Data Generation for the benchmark tests
- Benchmark Test Execution on Different Generations of Graviton Instances (r6g, r7g, r8g)
- Benchmark Results
- Customizable Benchmarks to suit your workloads
- Autoscaling and Cost Optimization Strategies
