diff --git a/analytics/terraform/spark-k8s-operator/README.md b/analytics/terraform/spark-k8s-operator/README.md index 6d79d5a72..0117392dd 100644 --- a/analytics/terraform/spark-k8s-operator/README.md +++ b/analytics/terraform/spark-k8s-operator/README.md @@ -31,6 +31,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 20.26 | | [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 | | [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | 1.34 | +| [jupyterhub\_single\_user\_irsa](#module\_jupyterhub\_single\_user\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.52.0 | | [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | | [spark\_team\_irsa](#module\_spark\_team\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | | [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | @@ -53,8 +54,11 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [kubernetes_annotations.gp2_default](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/annotations) | resource | | [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | | [kubernetes_cluster_role_binding.spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource | +| [kubernetes_namespace.jupyterhub](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace) | resource | | [kubernetes_namespace_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | +| [kubernetes_secret_v1.jupyterhub_single_user](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | | [kubernetes_secret_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | +| [kubernetes_service_account_v1.jupyterhub_single_user_sa](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | | [kubernetes_service_account_v1.spark_team](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account_v1) | resource | | [kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/storage_class) | resource | | [random_password.grafana](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | @@ -77,6 +81,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.31"` | no | | [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` |
[
"100.64.0.0/17",
"100.64.128.0/17"
]
| no | | [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no | +| [enable\_jupyterhub](#input\_enable\_jupyterhub) | Enable Jupyter Hub | `bool` | `false` | no | | [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no | | [enable\_yunikorn](#input\_enable\_yunikorn) | Enable Apache YuniKorn Scheduler | `bool` | `false` | no | | [kms\_key\_admin\_roles](#input\_kms\_key\_admin\_roles) | list of role ARNs to add to the KMS policy | `list(string)` | `[]` | no | diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf index 2408bf14a..a74683682 100644 --- a/analytics/terraform/spark-k8s-operator/addons.tf +++ b/analytics/terraform/spark-k8s-operator/addons.tf @@ -424,6 +424,16 @@ module "eks_data_addons" { repository_password = data.aws_ecrpublic_authorization_token.token.password } + #--------------------------------------------------------------- + # JupyterHub Add-on + #--------------------------------------------------------------- + enable_jupyterhub = var.enable_jupyterhub + jupyterhub_helm_config = { + values = [templatefile("${path.module}/helm-values/jupyterhub-singleuser-values.yaml", { + jupyter_single_user_sa_name = var.enable_jupyterhub ? kubernetes_service_account_v1.jupyterhub_single_user_sa[0].metadata[0].name : "not-used" + })] + version = "3.3.8" + } } #--------------------------------------------------------------- @@ -648,6 +658,8 @@ resource "aws_secretsmanager_secret_version" "grafana" { #--------------------------------------------------------------- # S3Table IAM policy for Karpenter nodes +# The S3 Tables library does not fully support IRSA and Pod Identity as of this writing. +# We give the node role access to S3 Tables to work around this limitation. #--------------------------------------------------------------- resource "aws_iam_policy" "s3tables_policy" { name_prefix = "${local.name}-s3tables" @@ -665,7 +677,9 @@ resource "aws_iam_policy" "s3tables_policy" { "s3tables:GetNamespace", "s3tables:GetTableBucket", "s3tables:GetTableBucketMaintenanceConfiguration", - "s3tables:GetTableBucketPolicy" + "s3tables:GetTableBucketPolicy", + "s3tables:CreateNamespace", + "s3tables:CreateTable" ] Resource = "arn:aws:s3tables:*:${data.aws_caller_identity.current.account_id}:bucket/*" }, diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/Dockerfile-S3Table-notebook b/analytics/terraform/spark-k8s-operator/examples/s3-tables/Dockerfile-S3Table-notebook new file mode 100644 index 000000000..f15b1ec2a --- /dev/null +++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/Dockerfile-S3Table-notebook @@ -0,0 +1,51 @@ +#-------------------------------------------------------------------------------------------- +# Dockerfile for a Jupyter PySpark 3.5.3 notebook image with Amazon S3 Tables support on multi-arch platforms (AMD64 & ARM64) +#-------------------------------------------------------------------------------------------- +# Step 1: Create a Private or Public ECR repo from the AWS Console or CLI +# e.g., aws ecr-public create-repository --repository-name spark --region us-east-1 +#--- +# Step 2: Docker login: +# aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/ +#--- +# Step 3: Build the multi-arch image and push it to ECR: +# docker buildx build --platform linux/amd64,linux/arm64 -t public.ecr.aws//spark:pyspark-notebook-3.5.3-s3table0.1.3 --push .
+#-------------------------------------------------------------------------------------------- + +# Use the official pyspark notebook base image +FROM quay.io/jupyter/pyspark-notebook:spark-3.5.3 + +# Arguments for version control +ARG HADOOP_VERSION=3.4.1 +ARG PREV_HADOOP_VERSION=3.3.4 +ARG AWS_SDK_VERSION=2.29.45 +ARG ICEBERG_VERSION=1.6.1 +ARG S3_TABLES_VERSION=0.1.3 +ARG NOTEBOOK_USER=1000 + +# Set environment variables +ENV HADOOP_DIR=/usr/local/spark-3.5.3-bin-hadoop3 + +# Set up as root to install dependencies and tools +USER root + +# Remove any old Hadoop libraries to avoid conflicts +RUN rm -f ${HADOOP_DIR}/jars/hadoop-client-* && \ + rm -f ${HADOOP_DIR}/jars/hadoop-yarn-server-web-proxy-*.jar + +# Add Hadoop AWS connector and related Hadoop dependencies +RUN cd ${HADOOP_DIR}/jars && \ + wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O hadoop-aws-${PREV_HADOOP_VERSION}.jar && \ + wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -O hadoop-client-api-${PREV_HADOOP_VERSION}.jar && \ + wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -O hadoop-client-runtime-${PREV_HADOOP_VERSION}.jar && \ + wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar -O hadoop-common-${PREV_HADOOP_VERSION}.jar && \ + wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-yarn-server-web-proxy/${HADOOP_VERSION}/hadoop-yarn-server-web-proxy-${HADOOP_VERSION}.jar -O hadoop-yarn-server-web-proxy-${PREV_HADOOP_VERSION}.jar + +# Add Iceberg, AWS SDK bundle, and S3 Tables Catalog for Iceberg runtime +RUN cd ${HADOOP_DIR}/jars && \ + wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar && \ + wget https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${AWS_SDK_VERSION}/bundle-${AWS_SDK_VERSION}.jar && \ + wget https://repo1.maven.org/maven2/software/amazon/s3tables/s3-tables-catalog-for-iceberg-runtime/${S3_TABLES_VERSION}/s3-tables-catalog-for-iceberg-runtime-${S3_TABLES_VERSION}.jar + + +# Switch to non-root user for security best practices +USER ${NOTEBOOK_USER} diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md b/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md index b57b20430..ec8a0d989 100644 --- a/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md +++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/README.md @@ -1,167 +1,3 @@ -# S3Table with OSS Spark on EKS Guide +# S3Table with OSS Spark on EKS -This guide provides step-by-step instructions for setting up and running a Spark job on Amazon EKS using S3Table for data storage. - -## Prerequisites - -- Latest version of AWS CLI installed (must include S3Tables API support) - -## Step 1: Deploy Spark Cluster on EKS - -Follow the steps to deploy Spark Cluster on EKS - -[Spark Operator on EKS with YuniKorn Scheduler](https://awslabs.github.io/data-on-eks/docs/blueprints/data-analytics/spark-operator-yunikorn#prerequisites) - -Once your cluster is up and running, proceed with the following steps to execute a sample Spark job using S3Tables. 
- -## Step 2: Create Test Data for the job - -Navigate to the example directory and Generate sample data: - -```sh -cd analytics/terraform/spark-k8s-operator/examples/s3-tables -./input-data-gen.sh -``` - -This will create a file called `employee_data.csv` locally with 100 records. Modify the script to adjust the number of records as needed. - -## Step 3: Upload Test Input data to your S3 Bucket - -Replace `` with the name of the S3 bucket created by your blueprint and run the below command. - -```sh -aws s3 cp employee_data.csv s3:///s3table-example/input/ -``` - -## Step 4: Upload PySpark Script to S3 Bucket - -Replace `` with the name of the S3 bucket created by your blueprint and run the below command to upload sample Spark job to S3 buckets. - -```sh -aws s3 cp s3table-iceberg-pyspark.py s3:///s3table-example/scripts/ -``` - -## Step 5: Create S3Table - -Replace and with desired names. - -```sh -aws s3tables create-table-bucket \ - --region "" \ - --name "" -``` - -Make note of the S3TABLE ARN generated by this command. - -## Step 6: Update Spark Operator YAML File - - - Open `s3table-spark-operator.yaml` file in your preferred text editor. - - Replace `` with your S3 bucket created by this blueprint(Check Terraform outputs). S3 Bucket where you copied test data and sample spark job in the above steps. - - REPLACE `` with your S3 Table ARN. - -## Step 7: Execute Spark Job - -Apply the updated YAML file to your Kubernetes cluster to submit the Spark Job. - -```sh -cd analytics/terraform/spark-k8s-operator/examples/s3-tables -kubectl apply -f s3table-spark-operator.yaml -``` - -## Step 8: Verify the Spark Driver log for the output - -Check the Spark driver logs to verify job progress and output: - -```sh -kubectl logs -n spark-team-a -``` - -## Step 9: Verify the S3Table using S3Table API - -Use the S3Table API to confirm the table was created successfully. Just replace the `` and run the command. - -```sh -aws s3tables get-table --table-bucket-arn arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table --namespace doeks_namespace --name employee_s3_table -``` - -Output looks like below. - -```json -{ - "name": "employee_s3_table", - "type": "customer", - "tableARN": "arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table/table/55511111-7a03-4513-b921-e372b0030daf", - "namespace": [ - "doeks_namespace" - ], - "versionToken": "aafc39ddd462690d2a0c", - "metadataLocation": "s3://55511111-7a03-4513-bumiqc8ihp8rnxymuhyz8t1ammu7ausw2b--table-s3/metadata/00004-62cc4be3-59b5-4647-a78d-1cdf69ec5ed8.metadata.json", - "warehouseLocation": "s3://55511111-7a03-4513-bumiqc8ihp8rnxymuhyz8t1ammu7ausw2b--table-s3", - "createdAt": "2025-01-07T22:14:48.689581+00:00", - "createdBy": "", - "modifiedAt": "2025-01-09T00:06:09.222917+00:00", - "ownerAccountId": "", - "format": "ICEBERG" -} -``` - -Monitor the table maintenance job status: - -```sh -aws s3tables get-table-maintenance-job-status --table-bucket-arn arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table --namespace doeks_namespace --name employee_s3_table -``` - -This command provides information about Iceberg compaction, snapshot management, and unreferenced file removal processes. 
- -```json -{ - "tableARN": "arn:aws:s3tables:us-west-2::bucket/doeks-spark-on-eks-s3table/table/55511111-7a03-4513-b921-e372b0030daf", - "status": { - "icebergCompaction": { - "status": "Successful", - "lastRunTimestamp": "2025-01-08T01:18:08.857000+00:00" - }, - "icebergSnapshotManagement": { - "status": "Successful", - "lastRunTimestamp": "2025-01-08T22:17:08.811000+00:00" - }, - "icebergUnreferencedFileRemoval": { - "status": "Successful", - "lastRunTimestamp": "2025-01-08T22:17:10.377000+00:00" - } - } -} -``` - -## Step10: Clean up - -Delete the table. - -```bash -aws s3tables delete-table \ - --namespace doeks_namespace \ - --table-bucket-arn ${S3TABLE_ARN} \ - --name employee_s3_table -``` - -Delete the namespace. - -```bash -aws s3tables delete-namespace \ - --namespace doeks_namespace \ - --table-bucket-arn ${S3TABLE_ARN} -``` - -Finally, delete the bucket table - -```bash -aws s3tables delete-table-bucket \ - --region "" \ - --table-bucket-arn ${S3TABLE_ARN} -``` - - -# Conclusion -You have successfully set up and run a Spark job on Amazon EKS using S3Table for data storage. This setup provides a scalable and efficient way to process large datasets using Spark on Kubernetes with the added benefits of S3Table's data management capabilities. - -For more advanced usage, refer to the official AWS documentation on S3Table and Spark on EKS. +** Please see [our website](https://awslabs.github.io/data-on-eks/docs/blueprints/data-analytics/spark-operator-s3tables) for details on how to use this example ** diff --git a/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb new file mode 100644 index 000000000..97b012626 --- /dev/null +++ b/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6e966091-0998-449f-9dcd-a2032fa39f36", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "from datetime import datetime\n", + "\n", + "from pyspark.sql import SparkSession\n", + "\n", + "# Logging configuration\n", + "formatter = logging.Formatter('[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s')\n", + "handler = logging.StreamHandler(sys.stdout)\n", + "handler.setLevel(logging.INFO)\n", + "handler.setFormatter(formatter)\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.INFO)\n", + "logger.addHandler(handler)\n", + "\n", + "\n", + "# Application-specific variables\n", + "dt_string = datetime.now().strftime(\"%Y_%m_%d_%H_%M_%S\")\n", + "AppName = \"EmployeeDataS3TableJob\"\n", + "\n", + "# Replace S3_BUCKET and ACCOUNT_NUMBER with your own values\n", + "input_csv_path = \"s3a:///s3table-example/input/\"\n", + "s3table_arn = \"arn:aws:s3tables:us-west-2::bucket/doeks-spark-s3-tables\"\n", + "namespace = \"doeks_namespace\"\n", + "table_name = \"employee_s3_table\"\n", + "full_table_name = f\"s3tablesbucket.{namespace}.{table_name}\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d38e589b-b0ab-41b6-a339-07f5026a7262", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-01-13 19:47:18,917] INFO @ line 22: Spark session initialized successfully\n" + ] + } + ], + "source": [ + "\n", + "spark = (SparkSession\n", + " .builder\n", + " .appName(f\"{AppName}_{dt_string}\")\n", + " .config(\"spark.sql.extensions\", 
\"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " .config(\"spark.sql.catalog.s3tablesbucket\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " .config(\"spark.sql.catalog.s3tablesbucket.catalog-impl\", \"software.amazon.s3tables.iceberg.S3TablesCatalog\")\n", + " .config(\"spark.sql.catalog.s3tablesbucket.warehouse\", s3table_arn)\n", + " .config('spark.hadoop.fs.s3.impl', \"org.apache.hadoop.fs.s3a.S3AFileSystem\")\n", + " .config(\"spark.sql.defaultCatalog\", \"s3tablesbucket\")\n", + " .config(\"spark.hadoop.fs.s3a.connection.timeout\", \"1200000\") \\\n", + " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\") \\\n", + " .config(\"spark.hadoop.fs.s3a.connection.maximum\", \"200\") \\\n", + " .config(\"spark.hadoop.fs.s3a.fast.upload\", \"true\") \\\n", + " .config(\"spark.hadoop.fs.s3a.readahead.range\", \"256K\") \\\n", + " .config(\"spark.hadoop.fs.s3a.input.fadvise\", \"random\") \\\n", + " .config(\"spark.hadoop.fs.s3a.aws.credentials.provider.mapping\", \"com.amazonaws.auth.WebIdentityTokenCredentialsProvider=software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider\") \\\n", + " .config(\"spark.hadoop.fs.s3a.aws.credentials.provider\", \"software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider\")\n", + " .getOrCreate())\n", + "\n", + "spark.sparkContext.setLogLevel(\"DEBUG\")\n", + "logger.info(\"Spark session initialized successfully\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3c157786-d661-44e0-af6a-05b94a9e9b0e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-01-13 19:47:18,923] INFO @ line 6: Creating namespace: doeks_namespace\n", + "[2025-01-13 19:47:21,093] INFO @ line 10: Reading employee data from input CSV: s3a:///s3table-example/input/\n", + "[2025-01-13 19:47:24,560] INFO @ line 13: Previewing employee data schema\n", + "root\n", + " |-- id: integer (nullable = true)\n", + " |-- name: string (nullable = true)\n", + " |-- level: string (nullable = true)\n", + " |-- salary: double (nullable = true)\n", + "\n", + "[2025-01-13 19:47:24,564] INFO @ line 16: Previewing first 10 records from the input data\n", + "+---+-----------+------+--------+\n", + "|id |name |level |salary |\n", + "+---+-----------+------+--------+\n", + "|1 |Employee_1 |Exec |101000.0|\n", + "|2 |Employee_2 |Exec |149000.0|\n", + "|3 |Employee_3 |Junior|86000.0 |\n", + "|4 |Employee_4 |Exec |147500.0|\n", + "|5 |Employee_5 |Exec |74000.0 |\n", + "|6 |Employee_6 |Exec |66500.0 |\n", + "|7 |Employee_7 |Junior|69500.0 |\n", + "|8 |Employee_8 |Exec |116000.0|\n", + "|9 |Employee_9 |Mid |56000.0 |\n", + "|10 |Employee_10|Exec |186500.0|\n", + "+---+-----------+------+--------+\n", + "only showing top 10 rows\n", + "\n", + "[2025-01-13 19:47:24,799] INFO @ line 19: Source data count:\n" + ] + }, + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "namespace = \"doeks_namespace\"\n", + "table_name = \"employee_s3_table\"\n", + "full_table_name = f\"s3tablesbucket.{namespace}.{table_name}\"\n", + "\n", + "# Step 1: Create namespace if not exists\n", + "logger.info(f\"Creating namespace: {namespace}\")\n", + "spark.sql(f\"CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.{namespace}\")\n", + "\n", + "# Step 2: Read input CSV data\n", + "logger.info(f\"Reading employee data from input CSV: {input_csv_path}\")\n", + "employee_df = 
spark.read.csv(input_csv_path, header=True, inferSchema=True)\n", + "\n", + "logger.info(\"Previewing employee data schema\")\n", + "employee_df.printSchema()\n", + "\n", + "logger.info(\"Previewing first 10 records from the input data\")\n", + "employee_df.show(10, truncate=False)\n", + "\n", + "logger.info(\"Source data count:\")\n", + "employee_df.count()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0fe0e464-22f2-43e8-8cfe-36e108359a62", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-01-13 19:47:25,207] INFO @ line 2: Creating/Replacing and writing data to table: s3tablesbucket.doeks_namespace.employee_s3_table\n", + "[2025-01-13 19:47:27,861] INFO @ line 8: Reading data back from Iceberg table: s3tablesbucket.doeks_namespace.employee_s3_table\n", + "[2025-01-13 19:47:28,079] INFO @ line 11: Previewing first 10 records from the Iceberg table\n", + "+---+-----------+------+--------+\n", + "|id |name |level |salary |\n", + "+---+-----------+------+--------+\n", + "|1 |Employee_1 |Exec |101000.0|\n", + "|2 |Employee_2 |Exec |149000.0|\n", + "|3 |Employee_3 |Junior|86000.0 |\n", + "|4 |Employee_4 |Exec |147500.0|\n", + "|5 |Employee_5 |Exec |74000.0 |\n", + "|6 |Employee_6 |Exec |66500.0 |\n", + "|7 |Employee_7 |Junior|69500.0 |\n", + "|8 |Employee_8 |Exec |116000.0|\n", + "|9 |Employee_9 |Mid |56000.0 |\n", + "|10 |Employee_10|Exec |186500.0|\n", + "+---+-----------+------+--------+\n", + "only showing top 10 rows\n", + "\n", + "[2025-01-13 19:47:28,794] INFO @ line 15: Total records in Iceberg table (DataFrame API):\n", + "DataFrame count: 100\n", + "[2025-01-13 19:47:29,153] INFO @ line 19: List the s3table snapshot versions:\n", + "+--------------------+-------------------+---------+-------------------+\n", + "| made_current_at| snapshot_id|parent_id|is_current_ancestor|\n", + "+--------------------+-------------------+---------+-------------------+\n", + "|2025-01-13 19:45:...| 271131297078418895| NULL| false|\n", + "|2025-01-13 19:47:...|1268450705309139006| NULL| true|\n", + "+--------------------+-------------------+---------+-------------------+\n", + "\n", + "[2025-01-13 19:47:29,349] INFO @ line 23: Stopping Spark Session\n" + ] + } + ], + "source": [ + "# Step 3: Create or replace table and write data in one operation\n", + "logger.info(f\"Creating/Replacing and writing data to table: {full_table_name}\")\n", + "(employee_df.writeTo(full_table_name)\n", + " .using(\"iceberg\")\n", + " .createOrReplace())\n", + "\n", + "# Step 4: Read data back from the Iceberg table\n", + "logger.info(f\"Reading data back from Iceberg table: {full_table_name}\")\n", + "iceberg_data_df = spark.read.format(\"iceberg\").load(full_table_name)\n", + "\n", + "logger.info(\"Previewing first 10 records from the Iceberg table\")\n", + "iceberg_data_df.show(10, truncate=False)\n", + "\n", + "# Count records using both DataFrame API and SQL\n", + "logger.info(\"Total records in Iceberg table (DataFrame API):\")\n", + "print(f\"DataFrame count: {iceberg_data_df.count()}\")\n", + "\n", + "# List the table snapshots\n", + "logger.info(\"List the s3table snapshot versions:\")\n", + "spark.sql(f\"SELECT * FROM {full_table_name}.history LIMIT 10\").show()\n", + "\n", + "# Stop Spark session\n", + "logger.info(\"Stopping Spark Session\")\n", + "spark.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a41ac2c9-96a0-420a-b270-8d720a0e094b", + "metadata": {}, + "outputs": [], + "source": [] + } + 
], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analytics/terraform/spark-k8s-operator/helm-values/jupyterhub-singleuser-values.yaml b/analytics/terraform/spark-k8s-operator/helm-values/jupyterhub-singleuser-values.yaml new file mode 100644 index 000000000..7fe1b55ab --- /dev/null +++ b/analytics/terraform/spark-k8s-operator/helm-values/jupyterhub-singleuser-values.yaml @@ -0,0 +1,75 @@ +hub: + db: + pvc: + storage: 50Gi + storageClassName: gp3 + authenticatePrometheus: false + +proxy: + https: + enabled: false + type: offload + service: + type: ClusterIP + +singleuser: + profileList: + - display_name: Data Engineering (CPU) + description: "PySpark Notebooks | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pyspark350: + # CAUTION: Unsupported test image + # This image is created solely for testing and reference purposes. + # Before use, please: + # 1. Review the Dockerfile used to create this image (Dockerfile-S3Table-notebook) + # 2. Create your own image that meets your organization's security requirements + display_name: "PySpark 3.5.3 with S3 Tables support" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/spark:pyspark-notebook-3.5.3-s3table0.1.3 + pyspark341: # Use this if you'd like to customize the base image yourself. + display_name: "PySpark 3.5.3" + kubespawner_override: + image: quay.io/jupyter/pyspark-notebook:spark-3.5.3 + kubespawner_override: + extra_pod_config: + dns_policy: ClusterFirst + node_selector: + NodeGroupType: "SparkComputeOptimized" + karpenter.sh/capacity-type: "on-demand" + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + serviceAccountName: ${jupyter_single_user_sa_name} + uid: 0 + cloudMetadata: + # CAUTION: This should be set to true for almost all cases. + # This is set to false for s3-tables-catalog to assume the node role, because there is an issue with credentials injections through IRSA in the s3 tables catalog library. + blockWithIptables: false + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + +# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/analytics/terraform/spark-k8s-operator/jupyterhub.tf b/analytics/terraform/spark-k8s-operator/jupyterhub.tf new file mode 100644 index 000000000..5b4c389ed --- /dev/null +++ b/analytics/terraform/spark-k8s-operator/jupyterhub.tf @@ -0,0 +1,53 @@ +#----------------------------------------------------------------------------------------- +# JupyterHub Single User IRSA Configuration +#----------------------------------------------------------------------------------------- +resource "kubernetes_namespace" "jupyterhub" { + count = var.enable_jupyterhub ? 
1 : 0 + metadata { + name = "jupyterhub" + } +} + +module "jupyterhub_single_user_irsa" { + count = var.enable_jupyterhub ? 1 : 0 + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.52.0" + role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa" + + role_policy_arns = { + policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances. + s3tables_policy = aws_iam_policy.s3tables.arn + } + + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["${kubernetes_namespace.jupyterhub[0].metadata[0].name}:${module.eks.cluster_name}-jupyterhub-single-user"] + } + } +} + +resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" { + count = var.enable_jupyterhub ? 1 : 0 + metadata { + name = "${module.eks.cluster_name}-jupyterhub-single-user" + namespace = kubernetes_namespace.jupyterhub[0].metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa[0].iam_role_arn } + } + + automount_service_account_token = true +} + +resource "kubernetes_secret_v1" "jupyterhub_single_user" { + count = var.enable_jupyterhub ? 1 : 0 + metadata { + name = "${module.eks.cluster_name}-jupyterhub-single-user-secret" + namespace = kubernetes_namespace.jupyterhub[0].metadata[0].name + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.jupyterhub_single_user_sa[0].metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub[0].metadata[0].name + } + } + + type = "kubernetes.io/service-account-token" +} diff --git a/analytics/terraform/spark-k8s-operator/variables.tf b/analytics/terraform/spark-k8s-operator/variables.tf index 0ff0b5aeb..995e910fc 100644 --- a/analytics/terraform/spark-k8s-operator/variables.tf +++ b/analytics/terraform/spark-k8s-operator/variables.tf @@ -72,6 +72,12 @@ variable "enable_yunikorn" { type = bool } +variable "enable_jupyterhub" { + default = false + description = "Enable Jupyter Hub" + type = bool +} + variable "kms_key_admin_roles" { description = "list of role ARNs to add to the KMS policy" type = list(string) diff --git a/website/docs/blueprints/data-analytics/img/s3tables-jupyter-notebook.png b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-notebook.png new file mode 100644 index 000000000..c74b4a5df Binary files /dev/null and b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-notebook.png differ diff --git a/website/docs/blueprints/data-analytics/img/s3tables-jupyter-select.png b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-select.png new file mode 100644 index 000000000..c6cf83df3 Binary files /dev/null and b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-select.png differ diff --git a/website/docs/blueprints/data-analytics/img/s3tables-jupyter-signin.png b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-signin.png new file mode 100644 index 000000000..239c08e49 Binary files /dev/null and b/website/docs/blueprints/data-analytics/img/s3tables-jupyter-signin.png differ diff --git a/website/docs/blueprints/data-analytics/spark-operator-s3tables.md b/website/docs/blueprints/data-analytics/spark-operator-s3tables.md index 41baadf18..315cc9ec3 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-s3tables.md +++ b/website/docs/blueprints/data-analytics/spark-operator-s3tables.md @@ -393,6 +393,56 
@@ Please note that these policies can further be adjusted and make it more granula ::: + +<CollapsibleContent header={<h2><span>Using S3 Tables with JupyterLab</span></h2>}> + +If you'd like to use JupyterLab to work with S3 Tables interactively, this blueprint allows you to deploy single-user JupyterLab instances to your cluster. + +> :warning: The JupyterHub configuration available here is for testing purposes only. +> +> :warning: Review the configuration and make any changes necessary to meet your security standards. + +### Update the Terraform variable and apply + +```bash +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator + +echo 'enable_jupyterhub = true' >> spark-operator.tfvars +terraform apply -var-file spark-operator.tfvars +``` + +### Ensure you have test data available in your S3 bucket + +```bash +cd analytics/terraform/spark-k8s-operator/examples/s3-tables +./input-data-gen.sh +aws s3 cp employee_data.csv s3://${S3_BUCKET}/s3table-example/input/ +``` + +### Access the JupyterHub UI and provision a JupyterLab server + +1. Port-forward the proxy service to your local machine. + + ```bash + kubectl port-forward svc/proxy-public 8888:80 -n jupyterhub + ``` + +1. Go to [`http://localhost:8888`](http://localhost:8888). Enter any username, leave the password field empty, then click "Sign in". + + ![sign in](./img/s3tables-jupyter-signin.png) + +1. Click "Start". You can also select the upstream PySpark notebook image from the drop-down list if you'd like to customize it yourself. + + ![select](./img/s3tables-jupyter-select.png) + +1. Copy examples from the [example Jupyter Notebook](https://github.com/awslabs/data-on-eks/blob/main/analytics/terraform/spark-k8s-operator/examples/s3-tables/s3table-iceberg-pyspark.ipynb) as a starting point to test S3 Tables features interactively. + + **Be sure to update the `S3_BUCKET` and `s3table_arn` values in the notebook.** + + ![notebook](./img/s3tables-jupyter-notebook.png) + +</CollapsibleContent> + <CollapsibleContent header={<h2><span>Cleanup</span></h2>}> :::caution @@ -424,6 +474,15 @@ aws s3tables delete-table-bucket \ --table-bucket-arn ${S3TABLE_ARN} ``` +## Delete the Jupyter Notebook server + +If you created a Jupyter Notebook server, delete the single-user server pods: + +```bash +kubectl delete pods -n jupyterhub -l component=singleuser-server +``` + + ## Delete the EKS cluster This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order.
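
For quick reference when working through the JupyterLab steps above, here is a minimal sketch that condenses the SparkSession settings used in `s3table-iceberg-pyspark.ipynb` into a single cell. The `S3_BUCKET` and `S3TABLE_ARN` values are placeholders you must replace with your own bucket and table-bucket ARN, the app name is arbitrary, and some of the notebook's S3A tuning options are omitted for brevity.

```python
from pyspark.sql import SparkSession

# Placeholders -- replace with the bucket created by the blueprint and your table bucket ARN.
S3_BUCKET = "<YOUR_S3_BUCKET>"
S3TABLE_ARN = "arn:aws:s3tables:us-west-2:<ACCOUNT_ID>:bucket/<TABLE_BUCKET_NAME>"

spark = (
    SparkSession.builder
    .appName("s3tables-jupyter-sketch")
    # Register the S3 table bucket as an Iceberg catalog named "s3tablesbucket".
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.s3tablesbucket", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.s3tablesbucket.catalog-impl", "software.amazon.s3tables.iceberg.S3TablesCatalog")
    .config("spark.sql.catalog.s3tablesbucket.warehouse", S3TABLE_ARN)
    .config("spark.sql.defaultCatalog", "s3tablesbucket")
    # Read the input CSV through the S3A connector.
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider")
    .getOrCreate()
)

# Smoke test: create the namespace, write the sample data, and list the tables.
spark.sql("CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.doeks_namespace")

df = spark.read.csv(f"s3a://{S3_BUCKET}/s3table-example/input/", header=True, inferSchema=True)
df.writeTo("s3tablesbucket.doeks_namespace.employee_s3_table").using("iceberg").createOrReplace()

spark.sql("SHOW TABLES IN s3tablesbucket.doeks_namespace").show()
```

Note that, as called out in the `addons.tf` comment and the Helm values (`cloudMetadata.blockWithIptables: false`), the S3 Tables catalog library does not yet work fully with IRSA, so the single-user pods currently fall back to the node role for S3 Tables access.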
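If you prefer to verify the table from Python rather than the `aws s3tables get-table` CLI call, a sketch along these lines should work. It assumes a recent boto3/botocore release that ships the `s3tables` client, and the account ID and table bucket name are placeholders; the parameter and response field names follow the S3 Tables `GetTable` API.

```python
import boto3

# Assumption: requires a boto3 version that includes the "s3tables" client;
# upgrade boto3 if client creation fails.
s3tables = boto3.client("s3tables", region_name="us-west-2")

# Placeholder: use the ARN returned by `aws s3tables create-table-bucket`.
table_bucket_arn = "arn:aws:s3tables:us-west-2:<ACCOUNT_ID>:bucket/<TABLE_BUCKET_NAME>"

resp = s3tables.get_table(
    tableBucketARN=table_bucket_arn,
    namespace="doeks_namespace",
    name="employee_s3_table",
)
print(resp["tableARN"])
print(resp["format"])            # Expected: "ICEBERG"
print(resp["metadataLocation"])  # Current Iceberg metadata file
```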