diff --git a/analytics/terraform/spark-k8s-operator/README.md b/analytics/terraform/spark-k8s-operator/README.md
index 6d79d5a72..0117392dd 100644
--- a/analytics/terraform/spark-k8s-operator/README.md
+++ b/analytics/terraform/spark-k8s-operator/README.md
@@ -31,6 +31,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 20.26 |
| [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 |
| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | 1.34 |
+| [jupyterhub\_single\_user\_irsa](#module\_jupyterhub\_single\_user\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.52.0 |
| [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
| [spark\_team\_irsa](#module\_spark\_team\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 |
| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 |
@@ -53,8 +54,11 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource |
| [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource |
| [kubernetes_cluster_role_binding.spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource |
+| [kubernetes_namespace.jupyterhub](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace) | resource |
| [kubernetes_namespace_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource |
+| [kubernetes_secret_v1.jupyterhub_single_user](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
| [kubernetes_secret_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource |
+| [kubernetes_service_account_v1.jupyterhub_single_user_sa](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource |
| [kubernetes_service_account_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource |
| [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource |
| [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
@@ -77,6 +81,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.31"` | no |
| [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
[
"100.64.0.0/17",
"100.64.128.0/17"
]
| no |
| [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no |
+| [enable\_jupyterhub](#input\_enable\_jupyterhub) | Enable JupyterHub | `bool` | `false` | no |
| [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no |
| [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `false` | no |
| [kms\_key\_admin\_roles](#input\_kms\_key\_admin\_roles) | list of role ARNs to add to the KMS policy | `list(string)` | `[]` | no |
diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf
index 2408bf14a..a74683682 100644
--- a/analytics/terraform/spark-k8s-operator/addons.tf
+++ b/analytics/terraform/spark-k8s-operator/addons.tf
@@ -424,6 +424,16 @@ module "eks_data_addons" {
repository_password = data.aws_ecrpublic_authorization_token.token.password
}
+ #---------------------------------------------------------------
+ # JupyterHub Add-on
+ #---------------------------------------------------------------
+ enable_jupyterhub = var.enable_jupyterhub
+ jupyterhub_helm_config = {
+ values = [templatefile("${path.module}/helm-values/jupyterhub-singleuser-values.yaml", {
+ jupyter_single_user_sa_name = var.enable_jupyterhub ? kubernetes_service_account_v1.jupyterhub_single_user_sa[0].metadata[0].name : "not-used"
+ })]
+ version = "3.3.8"
+ }
}
#---------------------------------------------------------------
@@ -648,6 +658,8 @@ resource "aws_secretsmanager_secret_version" "grafana" {
#---------------------------------------------------------------
# S3Table IAM policy for Karpenter nodes
+# The S3 Tables catalog library does not fully support IRSA or EKS Pod Identity as of this writing.
+# As a workaround, we grant the Karpenter node role access to S3 Tables.
#---------------------------------------------------------------
resource "aws_iam_policy" "s3tables_policy" {
name_prefix = "${local.name}-s3tables"
@@ -665,7 +677,9 @@ resource "aws_iam_policy" "s3tables_policy" {
"s3tables:GetNamespace",
"s3tables:GetTableBucket",
"s3tables:GetTableBucketMaintenanceConfiguration",
- "s3tables:GetTableBucketPolicy"
+ "s3tables:GetTableBucketPolicy",
+ "s3tables:CreateNamespace",
+ "s3tables:CreateTable"
]
Resource = "arn:aws:s3tables:*:${data.aws_caller_identity.current.account_id}:bucket/*"
},
diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/Dockerfile-S3Table-notebook b/analytics/terraform/spark-k8s-operator/examples/s3-tables/Dockerfile-S3Table-notebook
new file mode 100644
index 000000000..f15b1ec2a
--- /dev/null
+++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/Dockerfile-S3Table-notebook
@@ -0,0 +1,51 @@
+#--------------------------------------------------------------------------------------------
+# Dockerfile for a JupyterLab PySpark 3.5.3 notebook image with S3 Tables support on multi-arch platforms (AMD64 & ARM64)
+#--------------------------------------------------------------------------------------------
+# Step1: Create a Private or Public ECR repo from AWS Console or CLI
+# e.g., aws ecr-public create-repository --repository-name spark --region us-east-1
+#---
+# Step2: Docker Login:
+# aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
+#---
+# Step3: Build multi arch image and push it to ECR:
+# docker buildx build --platform linux/amd64,linux/arm64 -t public.ecr.aws/<REGISTRY_ALIAS>/spark:pyspark-notebook-3.5.3-s3table0.1.3 --push .
+#--------------------------------------------------------------------------------------------
+
+# Use the official pyspark notebook base image
+FROM quay.io/jupyter/pyspark-notebook:spark-3.5.3
+
+# Arguments for version control
+ARG HADOOP_VERSION=3.4.1
+ARG PREV_HADOOP_VERSION=3.3.4
+ARG AWS_SDK_VERSION=2.29.45
+ARG ICEBERG_VERSION=1.6.1
+ARG S3_TABLES_VERSION=0.1.3
+ARG NOTEBOOK_USER=1000
+
+# Set environment variables
+ENV HADOOP_DIR=/usr/local/spark-3.5.3-bin-hadoop3
+
+# Set up as root to install dependencies and tools
+USER root
+
+# Remove any old Hadoop libraries to avoid conflicts
+RUN rm -f ${HADOOP_DIR}/jars/hadoop-client-* && \
+ rm -f ${HADOOP_DIR}/jars/hadoop-yarn-server-web-proxy-*.jar
+
+# Add Hadoop AWS connector and related Hadoop dependencies
+RUN cd ${HADOOP_DIR}/jars && \
+ wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O hadoop-aws-${PREV_HADOOP_VERSION}.jar && \
+ wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -O hadoop-client-api-${PREV_HADOOP_VERSION}.jar && \
+ wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -O hadoop-client-runtime-${PREV_HADOOP_VERSION}.jar && \
+ wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar -O hadoop-common-${PREV_HADOOP_VERSION}.jar && \
+ wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-yarn-server-web-proxy/${HADOOP_VERSION}/hadoop-yarn-server-web-proxy-${HADOOP_VERSION}.jar -O hadoop-yarn-server-web-proxy-${PREV_HADOOP_VERSION}.jar
+
+# Add Iceberg, AWS SDK bundle, and S3 Tables Catalog for Iceberg runtime
+RUN cd ${HADOOP_DIR}/jars && \
+ wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar && \
+ wget https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar && \
+ wget https://repo1.maven.org/maven2/software/amazon/s3tables/s3-tables-catalog-for-iceberg-runtime/${S3_TABLES_VERSION}/s3-tables-catalog-for-iceberg-runtime-${S3_TABLES_VERSION}.jar
+
+
+# Switch to non-root user for security best practices
+USER ${NOTEBOOK_USER}
diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md b/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md
index b57b20430..ec8a0d989 100644
--- a/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md
+++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md
@@ -1,167 +1,3 @@
-# S3Table with OSS Spark on EKS Guide
+# S3Table with OSS Spark on EKS
-This guide provides step-by-step instructions for setting up and running a Spark job on Amazon EKS using S3Table for data storage.
-
-## Prerequisites
-
-- Latest version of AWS CLI installed (must include S3Tables API support)
-
-## Step 1: Deploy Spark Cluster on EKS
-
-Follow the steps to deploy Spark Cluster on EKS
-
-[Spark Operator on EKS with YuniKorn Scheduler](https://awslabs.github.io/data-on-eks/docs/blueprints/data-analytics/spark-operator-yunikorn#prerequisites)
-
-Once your cluster is up and running, proceed with the following steps to execute a sample Spark job using S3Tables.
-
-## Step 2: Create Test Data for the job
-
-Navigate to the example directory and Generate sample data:
-
-```sh
-cd analytics/terraform/spark-k8s-operator/examples/s3-tables
-./input-data-gen.sh
-```
-
-This will create a file called `employee_data.csv` locally with 100 records. Modify the script to adjust the number of records as needed.
-
-## Step 3: Upload Test Input data to your S3 Bucket
-
-Replace `` with the name of the S3 bucket created by your blueprint and run the below command.
-
-```sh
-aws s3 cp employee_data.csv s3:///s3table-example/input/
-```
-
-## Step 4: Upload PySpark Script to S3 Bucket
-
-Replace `` with the name of the S3 bucket created by your blueprint and run the below command to upload sample Spark job to S3 buckets.
-
-```sh
-aws s3 cp s3table-iceberg-pyspark.py s3:///s3table-example/scripts/
-```
-
-## Step 5: Create S3Table
-
-Replace and with desired names.
-
-```sh
-aws s3tables create-table-bucket \
- --region "" \
- --name ""
-```
-
-Make note of the S3TABLE ARN generated by this command.
-
-## Step 6: Update Spark Operator YAML File
-
- - Open `s3table-spark-operator.yaml` file in your preferred text editor.
- - Replace `` with your S3 bucket created by this blueprint(Check Terraform outputs). S3 Bucket where you copied test data and sample spark job in the above steps.
- - REPLACE `` with your S3 Table ARN.
-
-## Step 7: Execute Spark Job
-
-Apply the updated YAML file to your Kubernetes cluster to submit the Spark Job.
-
-```sh
-cd analytics/terraform/spark-k8s-operator/examples/s3-tables
-kubectl apply -f s3table-spark-operator.yaml
-```
-
-## Step 8: Verify the Spark Driver log for the output
-
-Check the Spark driver logs to verify job progress and output:
-
-```sh
-kubectl logs -n spark-team-a
-```
-
-## Step 9: Verify the S3Table using S3Table API
-
-Use the S3Table API to confirm the table was created successfully. Just replace the `` and run the command.
-
-```sh
-aws s3tables get-table --table-bucket-arn arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table --namespace doeks_namespace --name employee_s3_table
-```
-
-Output looks like below.
-
-```json
-{
- "name": "employee_s3_table",
- "type": "customer",
- "tableARN": "arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table/table/55511111-7a03-4513-b921-e372b0030daf",
- "namespace": [
- "doeks_namespace"
- ],
- "versionToken": "aafc39ddd462690d2a0c",
- "metadataLocation": "s3://55511111-7a03-4513-bumiqc8ihp8rnxymuhyz8t1ammu7ausw2b--table-s3/metadata/00004-62cc4be3-59b5-4647-a78d-1cdf69ec5ed8.metadata.json",
- "warehouseLocation": "s3://55511111-7a03-4513-bumiqc8ihp8rnxymuhyz8t1ammu7ausw2b--table-s3",
- "createdAt": "2025-01-07T22:14:48.689581+00:00",
- "createdBy": "",
- "modifiedAt": "2025-01-09T00:06:09.222917+00:00",
- "ownerAccountId": "",
- "format": "ICEBERG"
-}
-```
-
-Monitor the table maintenance job status:
-
-```sh
-aws s3tables get-table-maintenance-job-status --table-bucket-arn arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table --namespace doeks_namespace --name employee_s3_table
-```
-
-This command provides information about Iceberg compaction, snapshot management, and unreferenced file removal processes.
-
-```json
-{
- "tableARN": "arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table/table/55511111-7a03-4513-b921-e372b0030daf",
- "status": {
- "icebergCompaction": {
- "status": "Successful",
- "lastRunTimestamp": "2025-01-08T01:18:08.857000+00:00"
- },
- "icebergSnapshotManagement": {
- "status": "Successful",
- "lastRunTimestamp": "2025-01-08T22:17:08.811000+00:00"
- },
- "icebergUnreferencedFileRemoval": {
- "status": "Successful",
- "lastRunTimestamp": "2025-01-08T22:17:10.377000+00:00"
- }
- }
-}
-```
-
-## Step10: Clean up
-
-Delete the table.
-
-```bash
-aws s3tables delete-table \
- --namespace doeks_namespace \
- --table-bucket-arn ${S3TABLE_ARN} \
- --name employee_s3_table
-```
-
-Delete the namespace.
-
-```bash
-aws s3tables delete-namespace \
- --namespace doeks_namespace \
- --table-bucket-arn ${S3TABLE_ARN}
-```
-
-Finally, delete the bucket table
-
-```bash
-aws s3tables delete-table-bucket \
- --region "" \
- --table-bucket-arn ${S3TABLE_ARN}
-```
-
-
-# Conclusion
-You have successfully set up and run a Spark job on Amazon EKS using S3Table for data storage. This setup provides a scalable and efficient way to process large datasets using Spark on Kubernetes with the added benefits of S3Table's data management capabilities.
-
-For more advanced usage, refer to the official AWS documentation on S3Table and Spark on EKS.
+**Please see [our website](https://awslabs.github.io/data-on-eks/docs/blueprints/data-analytics/spark-operator-s3tables) for details on how to use this example.**
diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb
new file mode 100644
index 000000000..97b012626
--- /dev/null
+++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb
@@ -0,0 +1,251 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "6e966091-0998-449f-9dcd-a2032fa39f36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import logging\n",
+ "import sys\n",
+ "from datetime import datetime\n",
+ "\n",
+ "from pyspark.sql import SparkSession\n",
+ "\n",
+ "# Logging configuration\n",
+ "formatter = logging.Formatter('[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s')\n",
+ "handler = logging.StreamHandler(sys.stdout)\n",
+ "handler.setLevel(logging.INFO)\n",
+ "handler.setFormatter(formatter)\n",
+ "logger = logging.getLogger()\n",
+ "logger.setLevel(logging.INFO)\n",
+ "logger.addHandler(handler)\n",
+ "\n",
+ "\n",
+ "# Application-specific variables\n",
+ "dt_string = datetime.now().strftime(\"%Y_%m_%d_%H_%M_%S\")\n",
+ "AppName = \"EmployeeDataS3TableJob\"\n",
+ "\n",
+ "# Replace S3_BUCKET and ACCOUNT_NUMBER with your own values\n",
+ "input_csv_path = \"s3a:///s3table-example/input/\"\n",
+ "s3table_arn = \"arn:aws:s3tables:us-west-2::bucket/doeks-spark-s3-tables\"\n",
+ "namespace = \"doeks_namespace\"\n",
+ "table_name = \"employee_s3_table\"\n",
+ "full_table_name = f\"s3tablesbucket.{namespace}.{table_name}\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "d38e589b-b0ab-41b6-a339-07f5026a7262",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-01-13 19:47:18,917] INFO @ line 22: Spark session initialized successfully\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "spark = (SparkSession\n",
+ " .builder\n",
+ " .appName(f\"{AppName}_{dt_string}\")\n",
+ " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n",
+ " .config(\"spark.sql.catalog.s3tablesbucket\", \"org.apache.iceberg.spark.SparkCatalog\")\n",
+ " .config(\"spark.sql.catalog.s3tablesbucket.catalog-impl\", \"software.amazon.s3tables.iceberg.S3TablesCatalog\")\n",
+ " .config(\"spark.sql.catalog.s3tablesbucket.warehouse\", s3table_arn)\n",
+ " .config('spark.hadoop.fs.s3.impl', \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\n",
+ " .config(\"spark.sql.defaultCatalog\", \"s3tablesbucket\")\n",
+ " .config(\"spark.hadoop.fs.s3a.connection.timeout\", \"1200000\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.connection.maximum\", \"200\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.fast.upload\", \"true\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.readahead.range\", \"256K\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.input.fadvise\", \"random\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.aws.credentials.provider.mapping\", \"com.amazonaws.auth.WebIdentityTokenCredentialsProvider=software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider\") \\\n",
+ " .config(\"spark.hadoop.fs.s3a.aws.credentials.provider\", \"software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider\")\n",
+ " .getOrCreate())\n",
+ "\n",
+ "spark.sparkContext.setLogLevel(\"DEBUG\")\n",
+ "logger.info(\"Spark session initialized successfully\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3c157786-d661-44e0-af6a-05b94a9e9b0e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-01-13 19:47:18,923] INFO @ line 6: Creating namespace: doeks_namespace\n",
+ "[2025-01-13 19:47:21,093] INFO @ line 10: Reading employee data from input CSV: s3a:///s3table-example/input/\n",
+ "[2025-01-13 19:47:24,560] INFO @ line 13: Previewing employee data schema\n",
+ "root\n",
+ " |-- id: integer (nullable = true)\n",
+ " |-- name: string (nullable = true)\n",
+ " |-- level: string (nullable = true)\n",
+ " |-- salary: double (nullable = true)\n",
+ "\n",
+ "[2025-01-13 19:47:24,564] INFO @ line 16: Previewing first 10 records from the input data\n",
+ "+---+-----------+------+--------+\n",
+ "|id |name |level |salary |\n",
+ "+---+-----------+------+--------+\n",
+ "|1 |Employee_1 |Exec |101000.0|\n",
+ "|2 |Employee_2 |Exec |149000.0|\n",
+ "|3 |Employee_3 |Junior|86000.0 |\n",
+ "|4 |Employee_4 |Exec |147500.0|\n",
+ "|5 |Employee_5 |Exec |74000.0 |\n",
+ "|6 |Employee_6 |Exec |66500.0 |\n",
+ "|7 |Employee_7 |Junior|69500.0 |\n",
+ "|8 |Employee_8 |Exec |116000.0|\n",
+ "|9 |Employee_9 |Mid |56000.0 |\n",
+ "|10 |Employee_10|Exec |186500.0|\n",
+ "+---+-----------+------+--------+\n",
+ "only showing top 10 rows\n",
+ "\n",
+ "[2025-01-13 19:47:24,799] INFO @ line 19: Source data count:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "100"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "namespace = \"doeks_namespace\"\n",
+ "table_name = \"employee_s3_table\"\n",
+ "full_table_name = f\"s3tablesbucket.{namespace}.{table_name}\"\n",
+ "\n",
+ "# Step 1: Create namespace if not exists\n",
+ "logger.info(f\"Creating namespace: {namespace}\")\n",
+ "spark.sql(f\"CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.{namespace}\")\n",
+ "\n",
+ "# Step 2: Read input CSV data\n",
+ "logger.info(f\"Reading employee data from input CSV: {input_csv_path}\")\n",
+ "employee_df = spark.read.csv(input_csv_path, header=True, inferSchema=True)\n",
+ "\n",
+ "logger.info(\"Previewing employee data schema\")\n",
+ "employee_df.printSchema()\n",
+ "\n",
+ "logger.info(\"Previewing first 10 records from the input data\")\n",
+ "employee_df.show(10, truncate=False)\n",
+ "\n",
+ "logger.info(\"Source data count:\")\n",
+ "employee_df.count()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0fe0e464-22f2-43e8-8cfe-36e108359a62",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2025-01-13 19:47:25,207] INFO @ line 2: Creating/Replacing and writing data to table: s3tablesbucket.doeks_namespace.employee_s3_table\n",
+ "[2025-01-13 19:47:27,861] INFO @ line 8: Reading data back from Iceberg table: s3tablesbucket.doeks_namespace.employee_s3_table\n",
+ "[2025-01-13 19:47:28,079] INFO @ line 11: Previewing first 10 records from the Iceberg table\n",
+ "+---+-----------+------+--------+\n",
+ "|id |name |level |salary |\n",
+ "+---+-----------+------+--------+\n",
+ "|1 |Employee_1 |Exec |101000.0|\n",
+ "|2 |Employee_2 |Exec |149000.0|\n",
+ "|3 |Employee_3 |Junior|86000.0 |\n",
+ "|4 |Employee_4 |Exec |147500.0|\n",
+ "|5 |Employee_5 |Exec |74000.0 |\n",
+ "|6 |Employee_6 |Exec |66500.0 |\n",
+ "|7 |Employee_7 |Junior|69500.0 |\n",
+ "|8 |Employee_8 |Exec |116000.0|\n",
+ "|9 |Employee_9 |Mid |56000.0 |\n",
+ "|10 |Employee_10|Exec |186500.0|\n",
+ "+---+-----------+------+--------+\n",
+ "only showing top 10 rows\n",
+ "\n",
+ "[2025-01-13 19:47:28,794] INFO @ line 15: Total records in Iceberg table (DataFrame API):\n",
+ "DataFrame count: 100\n",
+ "[2025-01-13 19:47:29,153] INFO @ line 19: List the s3table snapshot versions:\n",
+ "+--------------------+-------------------+---------+-------------------+\n",
+ "| made_current_at| snapshot_id|parent_id|is_current_ancestor|\n",
+ "+--------------------+-------------------+---------+-------------------+\n",
+ "|2025-01-13 19:45:...| 271131297078418895| NULL| false|\n",
+ "|2025-01-13 19:47:...|1268450705309139006| NULL| true|\n",
+ "+--------------------+-------------------+---------+-------------------+\n",
+ "\n",
+ "[2025-01-13 19:47:29,349] INFO @ line 23: Stopping Spark Session\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Step 3: Create or replace table and write data in one operation\n",
+ "logger.info(f\"Creating/Replacing and writing data to table: {full_table_name}\")\n",
+ "(employee_df.writeTo(full_table_name)\n",
+ " .using(\"iceberg\")\n",
+ " .createOrReplace())\n",
+ "\n",
+ "# Step 4: Read data back from the Iceberg table\n",
+ "logger.info(f\"Reading data back from Iceberg table: {full_table_name}\")\n",
+ "iceberg_data_df = spark.read.format(\"iceberg\").load(full_table_name)\n",
+ "\n",
+ "logger.info(\"Previewing first 10 records from the Iceberg table\")\n",
+ "iceberg_data_df.show(10, truncate=False)\n",
+ "\n",
+ "# Count records using both DataFrame API and SQL\n",
+ "logger.info(\"Total records in Iceberg table (DataFrame API):\")\n",
+ "print(f\"DataFrame count: {iceberg_data_df.count()}\")\n",
+ "\n",
+ "# List the table snapshots\n",
+ "logger.info(\"List the s3table snapshot versions:\")\n",
+ "spark.sql(f\"SELECT * FROM {full_table_name}.history LIMIT 10\").show()\n",
+ "\n",
+ "# Stop Spark session\n",
+ "logger.info(\"Stopping Spark Session\")\n",
+ "spark.stop()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a41ac2c9-96a0-420a-b270-8d720a0e094b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analytics/terraform/spark-k8s-operator/helm-values/jupyterhub-singleuser-values.yaml b/analytics/terraform/spark-k8s-operator/helm-values/jupyterhub-singleuser-values.yaml
new file mode 100644
index 000000000..7fe1b55ab
--- /dev/null
+++ b/analytics/terraform/spark-k8s-operator/helm-values/jupyterhub-singleuser-values.yaml
@@ -0,0 +1,75 @@
+hub:
+ db:
+ pvc:
+ storage: 50Gi
+ storageClassName: gp3
+ authenticatePrometheus: false
+
+proxy:
+ https:
+ enabled: false
+ type: offload
+ service:
+ type: ClusterIP
+
+singleuser:
+ profileList:
+ - display_name: Data Engineering (CPU)
+ description: "PySpark Notebooks | Karpenter AutoScaling"
+ profile_options:
+ image:
+ display_name: "Image"
+ choices:
+ pyspark350:
+ # CAUTION: Unsupported test image
+ # This image is created solely for testing and reference purposes.
+ # Before use, please:
+ # 1. Review the Dockerfile used to create this image (Dockerfile-S3Table-notebook)
+ # 2. Create your own image that meets your organization's security requirements
+ display_name: "PySpark 3.5.3 with S3 Tables support"
+ default: true
+ kubespawner_override:
+ image: public.ecr.aws/data-on-eks/spark:pyspark-notebook-3.5.3-s3table0.1.3
+ pyspark353: # Use this if you'd like to customize the base image yourself.
+ display_name: "PySpark 3.5.3"
+ kubespawner_override:
+ image: quay.io/jupyter/pyspark-notebook:spark-3.5.3
+ kubespawner_override:
+ extra_pod_config:
+ dns_policy: ClusterFirst
+ node_selector:
+ NodeGroupType: "SparkComputeOptimized"
+ karpenter.sh/capacity-type: "on-demand"
+ cpu_guarantee: 2
+ mem_guarantee: 8G
+ cpu_limit: 4
+ mem_limit: 8G
+ cmd: null
+ serviceAccountName: ${jupyter_single_user_sa_name}
+ uid: 0
+ cloudMetadata:
+ # CAUTION: This should be set to true for almost all cases.
+ # It is set to false here so the s3-tables-catalog library can assume the node role, because credential injection through IRSA does not currently work in the S3 Tables catalog library.
+ blockWithIptables: false
+ extraPodConfig: # Needed so the jovyan user running in each single-user pod can access the Service Account token
+ securityContext:
+ fsGroup: 100
+
+# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+scheduling:
+ userScheduler:
+ enabled: true
+ podPriority:
+ enabled: true
+ userPlaceholder:
+ enabled: false
+ replicas: 1
+prePuller:
+ hook:
+ enabled: false
+ continuous:
+ # NOTE: if used with Karpenter, also add user-placeholders
+ enabled: false
+
+global:
+ safeToShowValues: false
diff --git a/analytics/terraform/spark-k8s-operator/jupyterhub.tf b/analytics/terraform/spark-k8s-operator/jupyterhub.tf
new file mode 100644
index 000000000..5b4c389ed
--- /dev/null
+++ b/analytics/terraform/spark-k8s-operator/jupyterhub.tf
@@ -0,0 +1,53 @@
+#-----------------------------------------------------------------------------------------
+# JupyterHub Single User IRSA Configuration
+#-----------------------------------------------------------------------------------------
+resource "kubernetes_namespace" "jupyterhub" {
+ count = var.enable_jupyterhub ? 1 : 0
+ metadata {
+ name = "jupyterhub"
+ }
+}
+
+module "jupyterhub_single_user_irsa" {
+ count = var.enable_jupyterhub ? 1 : 0
+ source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+ version = "~> 5.52.0"
+ role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa"
+
+ role_policy_arns = {
+ policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances.
+ s3tables_policy = aws_iam_policy.s3tables_policy.arn
+ }
+
+ oidc_providers = {
+ main = {
+ provider_arn = module.eks.oidc_provider_arn
+ namespace_service_accounts = ["${kubernetes_namespace.jupyterhub[0].metadata[0].name}:${module.eks.cluster_name}-jupyterhub-single-user"]
+ }
+ }
+}
+
+resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" {
+ count = var.enable_jupyterhub ? 1 : 0
+ metadata {
+ name = "${module.eks.cluster_name}-jupyterhub-single-user"
+ namespace = kubernetes_namespace.jupyterhub[0].metadata[0].name
+ annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa[0].iam_role_arn }
+ }
+
+ automount_service_account_token = true
+}
+
+resource "kubernetes_secret_v1" "jupyterhub_single_user" {
+ count = var.enable_jupyterhub ? 1 : 0
+ metadata {
+ name = "${module.eks.cluster_name}-jupyterhub-single-user-secret"
+ namespace = kubernetes_namespace.jupyterhub[0].metadata[0].name
+ annotations = {
+ "kubernetes.io/service-account.name" = kubernetes_service_account_v1.jupyterhub_single_user_sa[0].metadata[0].name
+ "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub[0].metadata[0].name
+ }
+ }
+
+ type = "kubernetes.io/service-account-token"
+}
diff --git a/analytics/terraform/spark-k8s-operator/variables.tf b/analytics/terraform/spark-k8s-operator/variables.tf
index 0ff0b5aeb..995e910fc 100644
--- a/analytics/terraform/spark-k8s-operator/variables.tf
+++ b/analytics/terraform/spark-k8s-operator/variables.tf
@@ -72,6 +72,12 @@ variable "enable_yunikorn" {
type = bool
}
+variable "enable_jupyterhub" {
+ default = false
+ description = "Enable Jupyter Hub"
+ type = bool
+}
+
variable "kms_key_admin_roles" {
description = "list of role ARNs to add to the KMS policy"
type = list(string)
diff --git a/website/docs/blueprints/data-analytics/img/s3tables-jupyter-notebook.png b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-notebook.png
new file mode 100644
index 000000000..c74b4a5df
Binary files /dev/null and b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-notebook.png differ
diff --git a/website/docs/blueprints/data-analytics/img/s3tables-jupyter-select.png b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-select.png
new file mode 100644
index 000000000..c6cf83df3
Binary files /dev/null and b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-select.png differ
diff --git a/website/docs/blueprints/data-analytics/img/s3tables-jupyter-signin.png b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-signin.png
new file mode 100644
index 000000000..239c08e49
Binary files /dev/null and b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-signin.png differ
diff --git a/website/docs/blueprints/data-analytics/spark-operator-s3tables.md b/website/docs/blueprints/data-analytics/spark-operator-s3tables.md
index 41baadf18..315cc9ec3 100644
--- a/website/docs/blueprints/data-analytics/spark-operator-s3tables.md
+++ b/website/docs/blueprints/data-analytics/spark-operator-s3tables.md
@@ -393,6 +393,56 @@ Please note that these policies can further be adjusted and make it more granula
:::
+
+<CollapsibleContent header={<h2><span>Using S3 Tables with JupyterLab</span></h2>}>
+
+If you'd like to use JupyterLab to work with S3 Tables interactively, this blueprint can also deploy single-user JupyterLab instances to your cluster.
+
+> :warning: The JupyterHub configuration provided here is for testing purposes only.
+>
+> :warning: Review the configuration and make the changes needed to meet your security standards.
+
+### Update Terraform variable and apply
+
+```bash
+cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator
+
+echo 'enable_jupyterhub = true' >> spark-operator.tfvars
+terraform apply -var-file spark-operator.tfvars
+```
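+
+Once the apply completes, you can optionally confirm that the JupyterHub pods are up before continuing (a quick check, assuming the add-on was deployed into the `jupyterhub` namespace created by this blueprint):
+
+```bash
+kubectl get pods -n jupyterhub
+```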
+
+### Ensure you have test data available in your S3 bucket
+
+```bash
+cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/s3-tables
+./input-data-gen.sh
+aws s3 cp employee_data.csv s3://${S3_BUCKET}/s3table-example/input/
+```
+
+### Access JupyterHub UI and provision a JupyterLab server
+
+1. Port-forward the proxy service to your local machine.
+
+ ```bash
+ kubectl port-forward svc/proxy-public 8888:80 -n jupyterhub
+ ```
+
+1. Go to [`http://localhost:8888`](http://localhost:8888). Enter any username, leave the password field empty, then click "Sign in".
+
+ ![sign in](./img/s3tables-jupyter-signin.png)
+
+1. Click "Start". You can also select the upstream PySpark notebook image from the drop-down list if you'd like to customize the image yourself.
+
+ ![select](./img/s3tables-jupyter-select.png)
+
+1. Copy examples from the [example Jupyter Notebook](https://github.com/awslabs/data-on-eks/blob/main/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb) as a starting point to test S3 Tables features interactively.
+
+ **Be sure to update the `S3_BUCKET` and `s3table_arn` values in the notebook (see the configuration sketch below).**
+
+ ![notebook](./img/s3tables-jupyter-notebook.png)
+
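+The key piece of the notebook is the Spark session configuration that registers the Iceberg S3 Tables catalog against your table bucket. Below is a minimal sketch of that configuration (assuming the S3 Tables catalog and Iceberg runtime jars baked into the notebook image above; the ARN is a placeholder you must replace with your own table bucket ARN):
+
+```python
+from pyspark.sql import SparkSession
+
+# Placeholder: replace with your own S3 Tables table bucket ARN
+s3table_arn = "arn:aws:s3tables:<REGION>:<ACCOUNT_NUMBER>:bucket/<TABLE_BUCKET_NAME>"
+
+spark = (SparkSession.builder
+    .appName("s3tables-jupyter-example")
+    # Enable Iceberg SQL extensions and register the S3 Tables catalog as "s3tablesbucket"
+    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
+    .config("spark.sql.catalog.s3tablesbucket", "org.apache.iceberg.spark.SparkCatalog")
+    .config("spark.sql.catalog.s3tablesbucket.catalog-impl", "software.amazon.s3tables.iceberg.S3TablesCatalog")
+    .config("spark.sql.catalog.s3tablesbucket.warehouse", s3table_arn)
+    .config("spark.sql.defaultCatalog", "s3tablesbucket")
+    .getOrCreate())
+
+# Create a namespace in the table bucket and list namespaces to verify the catalog wiring
+spark.sql("CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.doeks_namespace")
+spark.sql("SHOW NAMESPACES IN s3tablesbucket").show()
+```
+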
+</CollapsibleContent>
+
<CollapsibleContent header={<h2><span>Cleanup</span></h2>}>
:::caution
@@ -424,6 +474,15 @@ aws s3tables delete-table-bucket \
--table-bucket-arn ${S3TABLE_ARN}
```
+## Delete the Jupyter Notebook server
+
+If you created a Jupyter Notebook server, delete it with:
+
+```bash
+kubectl delete pods -n jupyterhub -l component=singleuser-server
+```
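+
+If you'd also like to remove the JupyterHub add-on itself, you can override the flag and re-apply (a sketch, assuming you appended `enable_jupyterhub = true` to `spark-operator.tfvars` as shown earlier and are running from the blueprint directory):
+
+```bash
+cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator
+terraform apply -var-file spark-operator.tfvars -var enable_jupyterhub=false
+```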
+
+
## Delete the EKS cluster
This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order.