# This RayCluster configuration deploys a distributed training environment for the Llama 3.1 8B model
# using the AWS Neuron SDK and Ray Train on Amazon EKS.
# ----------------------------------------------------------------------
# NOTE: For detailed deployment instructions, refer to the DoEKS website (https://awslabs.github.io/data-on-eks/docs/category/training-on-eks).
# ----------------------------------------------------------------------
# ----------------------------------------------------------------------
# NOTE: We are using the default namespace for this deployment because the fsx-claim PVC is created under the default namespace by the Terraform blueprint.
# If you want to deploy the cluster in a dedicated namespace, ensure that the FSx for Lustre PersistentVolumeClaim is also created in that namespace, since PVCs are namespace-bound.
# ----------------------------------------------------------------------
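# Pre-flight check (assumes kubectl is configured for the target EKS cluster); the claim
# should typically show STATUS "Bound" before this manifest is applied:
#   kubectl get pvc fsx-claim -n default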
# Docs for Volcano with KubeRay: https://docs.ray.io/en/master/cluster/kubernetes/k8s-ecosystem/volcano.html
---
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
  name: llama3-training-queue
  namespace: default
spec:
  weight: 1
  capability:
    cpu: '500'
    memory: 1500Gi
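# A quick check that the queue was created (assumes the Volcano CRDs are already installed,
# e.g. by the DoEKS blueprint):
#   kubectl get queues.scheduling.volcano.sh llama3-training-queue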
---
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: kuberay-trn1
  namespace: default
  labels:
    ray.io/scheduler-name: volcano
    volcano.sh/queue-name: llama3-training-queue
spec:
  rayVersion: 2.22.0
  headGroupSpec:
    # Head Node Configuration
    # This section defines the specification for the Ray head pod.
    # The head node manages the cluster and provides services like the dashboard and the Global Control Store (GCS).
    # (An example of reaching the dashboard follows this head group spec.)
    template:
      spec:
        containers:
          - name: ray-head
            #image: 111222333444.dkr.ecr.us-west-2.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
            image: <AWS_ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
            imagePullPolicy: Always # Pull the latest image each time
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"] # Graceful shutdown of Ray processes
            ports:
              - containerPort: 8265
                name: dashboard # Expose Ray dashboard
              - containerPort: 6379
                name: redis # Expose Redis port
              - containerPort: 10001
                name: object-manager # Expose object manager port
            resources:
              requests:
                cpu: 6
                memory: 30Gi
            volumeMounts:
              - mountPath: /tmp/ray
                name: log-volume # Mount for Ray logs
              - name: persistent-storage # Mount shared filesystem (FSx for Lustre)
                mountPath: /shared
        # Node Selector for Karpenter
        # Karpenter will provision this head pod on a node with the specified labels.
        nodeSelector:
          instanceType: mixed-x86
          provisionerType: Karpenter
        volumes:
          - name: log-volume
            emptyDir: {}
          - name: persistent-storage
            persistentVolumeClaim:
              claimName: fsx-claim # Reference the PVC for shared storage
    rayStartParams:
      dashboard-host: 0.0.0.0 # Make dashboard accessible
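  # Dashboard access (a minimal sketch; it assumes the default KubeRay head Service name
  # <cluster-name>-head-svc, i.e. kuberay-trn1-head-svc, in the default namespace):
  #   kubectl port-forward svc/kuberay-trn1-head-svc 8265:8265 -n default
  #   # then open http://localhost:8265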
  workerGroupSpecs:
    # Worker Node Configuration
    # This section defines the specification for the Ray worker pods.
    # Worker nodes execute tasks and participate in distributed training.
    - groupName: workergroup
      replicas: 2 # Number of worker replicas
      minReplicas: 2 # Minimum number of worker replicas
      maxReplicas: 2 # Maximum number of worker replicas (no scaling in this case)
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-worker
              #image: 111222333444.dkr.ecr.us-west-2.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
              image: <AWS_ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
              imagePullPolicy: Always # Pull the latest image each time
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 8265
                  name: dashboard
                - containerPort: 6379
                  name: redis
                - containerPort: 10001
                  name: object-manager
              resources:
                limits:
                  aws.amazon.com/neuron: '16' # Request AWS Neuron cores
                  vpc.amazonaws.com/efa: '8' # Request AWS EFA devices
                  memory: 440Gi
                requests:
                  aws.amazon.com/neuron: '16'
                  vpc.amazonaws.com/efa: '8'
                  cpu: '120'
                  memory: 440Gi
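              # A trn1.32xlarge node provides 16 Neuron devices (32 NeuronCores), 8 EFA interfaces,
              # 128 vCPUs and 512 GiB of memory, so each worker pod effectively reserves a full Trn1 node.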
              volumeMounts:
                - name: persistent-storage
                  mountPath: /shared # Mount shared filesystem (FSx for Lustre)
                - name: dshm
                  mountPath: /dev/shm # Mount for shared memory
                - mountPath: /tmp/ray
                  name: log-volume # Mount for Ray logs
          # Node Selector for Managed Node Group (with Cluster Autoscaler)
          # These workers will run on Trn1 instances provisioned by the cluster autoscaler.
          # This is necessary as Karpenter doesn't currently support EFA (required for Neuron distributed training).
          nodeSelector:
            instance-type: trn1-32xl
            provisioner: cluster-autoscaler
          # Tolerations for Trn1 and Dedicated Nodes
          tolerations:
            - key: "aws.amazon.com/neuron"
              operator: "Exists"
              effect: "NoSchedule"
            - key: "hub.jupyter.org/dedicated"
              operator: "Equal"
              value: "user"
              effect: "NoSchedule"
          volumes:
            # Persistent Volume Claim (PVC) to access the FSx for Lustre filesystem
            - name: persistent-storage
              persistentVolumeClaim:
                claimName: fsx-claim
            - name: dshm
              emptyDir:
                medium: Memory
            - name: log-volume
              emptyDir: {}
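
# ----------------------------------------------------------------------
# Optional post-deployment checks (a minimal sketch; KubeRay labels the cluster's pods with
# ray.io/cluster=<cluster-name>, which is used below to find the head pod):
#   kubectl get raycluster kuberay-trn1 -n default
#   kubectl get pods -n default -l ray.io/cluster=kuberay-trn1
#   kubectl exec -it <head-pod-name> -n default -- ray status
# ----------------------------------------------------------------------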