feat: Add spark benchmark test data generation changes (#694)
ratnopamc authored Nov 4, 2024
1 parent ad50d57 commit c635266
Showing 6 changed files with 151 additions and 47 deletions.
4 changes: 2 additions & 2 deletions analytics/terraform/spark-k8s-operator/eks.tf
@@ -175,9 +175,9 @@ module "eks" {
# Node group will be created with zero instances when you deploy the blueprint.
# You can change the min_size and desired_size to 6 instances
# desired_size might not be applied through terraform once the node group is created so this needs to be adjusted in AWS Console.
-min_size = 0 # Change min and desired to 6 for running benchmarks
+min_size = var.spark_benchmark_ssd_min_size # Change min and desired to 6 for running benchmarks
max_size = 8
-desired_size = 0 # Change min and desired to 6 for running benchmarks
+desired_size = var.spark_benchmark_ssd_desired_size # Change min and desired to 6 for running benchmarks

instance_types = ["c5d.12xlarge"] # c5d.12xlarge = 2 x 900 NVMe SSD
@@ -1,8 +1,7 @@
# NOTE: This example requires the following prerequisites before executing the jobs
# 1. Ensure spark-team-a namespace exists
-# 2. replace <S3_BUCKET> with your bucket name
+# 2. replace `<S3_BUCKET>` with your bucket name

---
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
@@ -21,22 +20,22 @@ spec:
mainClass: com.amazonaws.eks.tpcds.DataGeneration
mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
arguments:
-# TPC-DS data location
-- "s3a://<S3_BUCKET>/TPCDS-TEST-1TB"
-# Path to kit in the docker image
-- "/opt/tpcds-kit/tools"
-# Data Format
-- "parquet"
-# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
-- "1000"
-# Generate data num partitions
-- "200"
-# Create the partitioned fact tables
-- "true"
-# Shuffle to get partitions coalesced into single files.
-- "true"
-# Logging set to WARN
-- "true"
+# TPC-DS data location
+- "s3a://<S3_BUCKET>/TPCDS-TEST-1TB"
+# Path to kit in the docker image
+- "/opt/tpcds-kit/tools"
+# Data Format
+- "parquet"
+# Scale factor (in GB) - S3 output size shows 309.4GB for 1000GB Input
+- "1000"
+# Generate data num partitions
+- "200"
+# Create the partitioned fact tables
+- "true"
+# Shuffle to get partitions coalesced into single files.
+- "true"
+# Logging set to WARN
+- "true"
sparkConf:
# Expose Spark metrics for Prometheus
"spark.ui.prometheus.enabled": "true"
@@ -82,7 +81,7 @@ spec:
"spark.hadoop.fs.s3a.connection.maximum": "200"
"spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"
"spark.kubernetes.executor.podNamePrefix": "oss-data-gen"
-"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
+"spark.sql.shuffle.partitions": "2000" # Adjust according to your job size
# "spark.hadoop.fs.s3a.committer.staging.conflict-mode": "append"
# Data writing and shuffle tuning
"spark.shuffle.file.buffer": "1m"
@@ -111,47 +110,47 @@ spec:
securityContext:
runAsUser: 185
volumeMounts:
-- name: spark-local-dir-1
-mountPath: /data1
+- name: spark-local-dir-1
+mountPath: /data1
env:
-- name: JAVA_HOME
-value: "/opt/java/openjdk"
+- name: JAVA_HOME
+value: "/opt/java/openjdk"
initContainers:
-- name: volume-permission
-image: public.ecr.aws/docker/library/busybox
-command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
-volumeMounts:
-- name: spark-local-dir-1
-mountPath: /data1
+- name: volume-permission
+image: public.ecr.aws/docker/library/busybox
+command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
+volumeMounts:
+- name: spark-local-dir-1
+mountPath: /data1
nodeSelector:
-NodeGroupType: SparkComputeOptimized
+NodeGroupType: spark_benchmark_ssd
executor:
cores: 11
# The maximum memory size of the container to the running executor is determined by the sum of
# spark.executor.memoryOverhead, spark.executor.memory, spark.memory.offHeap.size, spark.executor.pyspark.memory
memory: "15g"
memoryOverhead: "4g"
-instances: 26
+instances: 22
serviceAccount: spark-team-a
securityContext:
runAsUser: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
-initContainers:
-- name: volume-permission
-image: public.ecr.aws/docker/library/busybox
-command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
-volumeMounts:
-- name: spark-local-dir-1
-mountPath: /data1
+initContainers:
+- name: volume-permission
+image: public.ecr.aws/docker/library/busybox
+command: ['sh', '-c', 'mkdir -p /data1; chown -R 185:185 /data1']
+volumeMounts:
+- name: spark-local-dir-1
+mountPath: /data1
env:
-- name: JAVA_HOME
-value: "/opt/java/openjdk"
+- name: JAVA_HOME
+value: "/opt/java/openjdk"
nodeSelector:
-NodeGroupType: SparkComputeOptimized
+NodeGroupType: spark_benchmark_ssd
volumes:
-- name: spark-local-dir-1
-hostPath:
-path: "/mnt/k8s-disks/0"
-type: DirectoryOrCreate
+- name: spark-local-dir-1
+hostPath:
+path: "/mnt/k8s-disks/0"
+type: DirectoryOrCreate
12 changes: 12 additions & 0 deletions analytics/terraform/spark-k8s-operator/variables.tf
@@ -77,3 +77,15 @@ variable "kms_key_admin_roles" {
type = list(string)
default = []
}
+
+variable "spark_benchmark_ssd_min_size" {
+  description = "Minimum size for node group of c5d.12xlarge instances to run data generation for Spark benchmark"
+  type        = number
+  default     = 0
+}
+
+variable "spark_benchmark_ssd_desired_size" {
+  description = "Desired size for node group of c5d.12xlarge instances to run data generation for Spark benchmark"
+  type        = number
+  default     = 0
+}
@@ -0,0 +1,7 @@
{
"label": "Spark Benchmarks",
"position": 2,
"link": {
"type": "generated-index"
}
}
@@ -0,0 +1,70 @@
---
sidebar_position: 2
sidebar_label: Data Generation
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CollapsibleContent from '../../../src/components/CollapsibleContent';

# Data Generation for Running Spark Benchmark Tests on Amazon EKS

The following guide provides instructions on how to generate the dataset for running the TPC-DS benchmark tests for Spark.

<CollapsibleContent header={<h2><span>Deploying the Solution</span></h2>}>

In this [example](https://github.com/awslabs/data-on-eks/tree/main/analytics/terraform/spark-k8s-operator), you will provision the following resources required to run Spark jobs with the open-source Spark Operator and Apache YuniKorn.

This example deploys an EKS cluster running the Spark K8s Operator into a new VPC.

- Creates a new sample VPC, 2 private subnets and 2 public subnets
- Creates an Internet Gateway for the public subnets and a NAT Gateway for the private subnets
- Creates an EKS cluster control plane with a public endpoint (for demo purposes only), a core managed node group, an on-demand node group and a Spot node group for Spark workloads
- Deploys Metrics Server, Cluster Autoscaler, Spark K8s Operator, Apache YuniKorn, Karpenter, Grafana, AMP and Prometheus server

### Prerequisites

Ensure that you have installed the following tools on your machine.

1. [aws cli](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
2. [kubectl](https://Kubernetes.io/docs/tasks/tools/)
3. [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli)
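
A quick way to confirm the tools are available on your PATH is to run their standard version commands (a minimal check; any recent versions should work):

```bash
aws --version
kubectl version --client
terraform version
```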

### Deploy

Clone the repository.

```bash
git clone https://github.com/awslabs/data-on-eks.git
cd data-on-eks
export DOEKS_HOME=$(pwd)
```

If DOEKS_HOME is ever unset, you can always set it manually by running `export DOEKS_HOME=$(pwd)` from your data-on-eks directory.

Export the following environment variables to set the minimum and desired number of SSD-enabled c5d.12xlarge instances. In our tests, we set both of these to `6`. Adjust the number of instances to suit your requirements and setup.

```bash
export TF_VAR_spark_benchmark_ssd_min_size=6
export TF_VAR_spark_benchmark_ssd_desired_size=6
```

Navigate into one of the example directories and run the `install.sh` script.

```bash
cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator
chmod +x install.sh
./install.sh
```
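
Once the install completes, you can optionally confirm that the SSD-backed benchmark node group has nodes registered. This check assumes the node group labels its nodes with `NodeGroupType: spark_benchmark_ssd`, the same label the benchmark manifests use as a node selector:

```bash
# List nodes that carry the benchmark node group label
kubectl get nodes -l NodeGroupType=spark_benchmark_ssd -o wide
```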

Now create an S3_BUCKET variable that holds the name of the bucket created
during the install. This bucket will be used in later examples to store output
data. If S3_BUCKET is ever unset, you can run the following commands again.

```bash
export S3_BUCKET=$(terraform output -raw s3_bucket_id_spark_history_server)
echo $S3_BUCKET
```

</CollapsibleContent>

@@ -0,0 +1,16 @@
---
sidebar_position: 1
sidebar_label: Introduction to Spark Benchmarks
---

# Introduction to Spark Benchmarks on Amazon EKS 🚀

This guide walks you through running Apache Spark benchmark tests on Amazon EKS, AWS's managed Kubernetes service. Benchmark tests help you evaluate and optimize Spark workloads on EKS by comparing results across different generations of Graviton-based EC2 instance families, especially when scaling for performance, cost efficiency, and reliability.

## Key Features 📈

- Data Generation for the benchmark tests
- Benchmark Test Execution on Different Generations of Graviton Instances (r6g, r7g, r8g)
- Benchmark Results
- Customizable Benchmarks to suit your workloads
- Autoscaling and Cost Optimization Strategies
