From 2d16954217029d5183cd973b90c0eca577333ccc Mon Sep 17 00:00:00 2001 From: Manabu McCloskey Date: Thu, 9 Jan 2025 19:04:04 +0000 Subject: [PATCH] fix spelling, add clean up step, add disclaimer Signed-off-by: Manabu McCloskey --- .../terraform/spark-k8s-operator/addons.tf | 2 +- .../examples/s3-tables/README.md | 53 ++++++++++++++----- .../s3-tables/s3table-iceberg-pyspark.py | 2 +- .../s3-tables/s3table-spark-operator.yaml | 7 ++- 4 files changed, 49 insertions(+), 15 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf index eb12838bd..2408bf14a 100644 --- a/analytics/terraform/spark-k8s-operator/addons.tf +++ b/analytics/terraform/spark-k8s-operator/addons.tf @@ -652,7 +652,7 @@ resource "aws_secretsmanager_secret_version" "grafana" { resource "aws_iam_policy" "s3tables_policy" { name_prefix = "${local.name}-s3tables" path = "/" - description = "S3Tables Metdata access for Nodes" + description = "S3Tables Metadata access for Nodes" policy = jsonencode({ Version = "2012-10-17" diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md b/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md index 43110b10a..b57b20430 100644 --- a/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md +++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md @@ -6,7 +6,7 @@ This guide provides step-by-step instructions for setting up and running a Spark - Latest version of AWS CLI installed (must include S3Tables API support) -## Step1: Deploy Spark Cluster on EKS +## Step 1: Deploy Spark Cluster on EKS Follow the steps to deploy Spark Cluster on EKS @@ -14,7 +14,7 @@ Follow the steps to deploy Spark Cluster on EKS Once your cluster is up and running, proceed with the following steps to execute a sample Spark job using S3Tables. 
-## Step2: Create Test Data for the job +## Step 2: Create Test Data for the job Navigate to the example directory and Generate sample data: @@ -25,7 +25,7 @@ cd analytics/terraform/spark-k8s-operator/examples/s3-tables This will create a file called `employee_data.csv` locally with 100 records. Modify the script to adjust the number of records as needed. -## Step3: Upload Test Input data to your S3 Bucket +## Step 3: Upload Test Input data to your S3 Bucket Replace `` with the name of the S3 bucket created by your blueprint and run the below command. @@ -33,7 +33,7 @@ Replace `` with the name of the S3 bucket created by your bluepr aws s3 cp employee_data.csv s3:///s3table-example/input/ ``` -## Step4: Upload PySpark Script to S3 Bucket +## Step 4: Upload PySpark Script to S3 Bucket Replace `` with the name of the S3 bucket created by your blueprint and run the below command to upload sample Spark job to S3 buckets. @@ -41,7 +41,7 @@ Replace `` with the name of the S3 bucket created by your blueprint a aws s3 cp s3table-iceberg-pyspark.py s3:///s3table-example/scripts/ ``` -## Step5: Create S3Table +## Step 5: Create S3Table Replace and with desired names. @@ -53,21 +53,22 @@ aws s3tables create-table-bucket \ Make note of the S3TABLE ARN generated by this command. -## Step6: Update Spark Operator YAML File +## Step 6: Update Spark Operator YAML File - Open `s3table-spark-operator.yaml` file in your preferred text editor. - - Replace `` with your S3 bucket created by this blueprint(Check Terraform outputs). S3 Bucket where you copied Test Data and Smaple spark job in the above steps. - - REPLACE `` with actaul S3 Table ARN. + - Replace `` with your S3 bucket created by this blueprint(Check Terraform outputs). S3 Bucket where you copied test data and sample spark job in the above steps. + - REPLACE `` with your S3 Table ARN. -## Step7: Execute Spark Job +## Step 7: Execute Spark Job Apply the updated YAML file to your Kubernetes cluster to submit the Spark Job. 
```sh +cd analytics/terraform/spark-k8s-operator/examples/s3-tables kubectl apply -f s3table-spark-operator.yaml ``` -## Step8: Verify the Spark Driver log for the output +## Step 8: Verify the Spark Driver log for the output Check the Spark driver logs to verify job progress and output: @@ -75,7 +76,7 @@ Check the Spark driver logs to verify job progress and output: kubectl logs -n spark-team-a ``` -## Step9: Verify the S3Table using S3Table API +## Step 9: Verify the S3Table using S3Table API Use the S3Table API to confirm the table was created successfully. Just replace the `` and run the command. @@ -132,7 +133,35 @@ This command provides information about Iceberg compaction, snapshot management, } ``` -## Conclusion +## Step 10: Clean up + +Delete the table. + +```bash +aws s3tables delete-table \ + --namespace doeks_namespace \ + --table-bucket-arn ${S3TABLE_ARN} \ + --name employee_s3_table +``` + +Delete the namespace. + +```bash +aws s3tables delete-namespace \ + --namespace doeks_namespace \ + --table-bucket-arn ${S3TABLE_ARN} +``` + +Finally, delete the table bucket. + +```bash +aws s3tables delete-table-bucket \ + --region "" \ + --table-bucket-arn ${S3TABLE_ARN} +``` + + +## Conclusion You have successfully set up and run a Spark job on Amazon EKS using S3Table for data storage. This setup provides a scalable and efficient way to process large datasets using Spark on Kubernetes with the added benefits of S3Table's data management capabilities. For more advanced usage, refer to the official AWS documentation on S3Table and Spark on EKS. 
diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.py b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.py index 976ded528..b1127050f 100644 --- a/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.py +++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.py @@ -85,7 +85,7 @@ def main(args): print(f"DataFrame count: {iceberg_data_df.count()}") # List the table snapshots - logger.info("List the s3table snaphot versions:") + logger.info("List the s3table snapshot versions:") spark.sql(f"SELECT * FROM {full_table_name}.history LIMIT 10").show() # Stop Spark session diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-spark-operator.yaml b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-spark-operator.yaml index 9b32deec9..7c78cd965 100644 --- a/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-spark-operator.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-spark-operator.yaml @@ -1,6 +1,6 @@ # Pre-requisite before running this job # Replace with your S3 bucket created by this blueprint(Check Terraform outputs) -# REPLACE with actaul S3 Table ARN +# REPLACE with actual S3 Table ARN --- apiVersion: "sparkoperator.k8s.io/v1beta2" kind: SparkApplication @@ -14,6 +14,11 @@ spec: type: Python sparkVersion: "3.5.3" mode: cluster + # CAUTION: Unsupported test image + # This image is created solely for testing and reference purposes. + # Before use, please: + # 1. Review the Dockerfile used to create this image + # 2. Create your own image that meets your organization's security requirements image: "public.ecr.aws/data-on-eks/spark:3.5.3-scala2.12-java17-python3-ubuntu-s3table0.1.3-iceberg1.6.1" imagePullPolicy: IfNotPresent mainApplicationFile: "s3a:///s3table-example/scripts/s3table-iceberg-pyspark.py"