# This RayCluster configuration deploys a distributed training environment for the Llama 3.1 8B model
# using the AWS Neuron SDK and Ray Train on Amazon EKS.
# ----------------------------------------------------------------------
# NOTE: For detailed deployment instructions, refer to the DoEKS website (https://awslabs.github.io/data-on-eks/docs/category/training-on-eks).
# ----------------------------------------------------------------------
# ----------------------------------------------------------------------
# NOTE: We are using the default namespace for this deployment because the fsx-claim PVC is created under the default namespace by the Terraform blueprint.
# If you want to deploy the cluster in a dedicated namespace, ensure that the FSx for Lustre PersistentVolumeClaim is also created in that namespace, since PVCs are namespace-bound.
# ----------------------------------------------------------------------
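# Pre-flight check (assumes kubectl is configured for the target EKS cluster); the claim
# should typically show STATUS "Bound" before this manifest is applied:
#   kubectl get pvc fsx-claim -n default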
# Docs for Volcano with KubeRay: https://docs.ray.io/en/master/cluster/kubernetes/k8s-ecosystem/volcano.html
---
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
  name: llama3-training-queue
  namespace: default
spec:
  weight: 1
  capability:
    cpu: '500'
    memory: 1500Gi
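# A quick check that the queue was created (assumes the Volcano CRDs are already installed,
# e.g. by the DoEKS blueprint):
#   kubectl get queues.scheduling.volcano.sh llama3-training-queue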
---
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: kuberay-trn1
  namespace: default
  labels:
    ray.io/scheduler-name: volcano
    volcano.sh/queue-name: llama3-training-queue
spec:
  rayVersion: 2.22.0
  headGroupSpec:
    # Head Node Configuration
    # This section defines the specification for the Ray head pod.
    # The head node manages the cluster and provides services like the dashboard and the Global Control Store (GCS).
    # (An example of reaching the dashboard follows this head group spec.)
    template:
      spec:
        containers:
          - name: ray-head
            #image: 111222333444.dkr.ecr.us-west-2.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
            image: <AWS_ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
            imagePullPolicy: Always # Pull the latest image each time
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"] # Graceful shutdown of Ray processes
            ports:
              - containerPort: 8265
                name: dashboard # Expose Ray dashboard
              - containerPort: 6379
                name: redis # Expose Redis port
              - containerPort: 10001
                name: object-manager # Expose object manager port
            resources:
              requests:
                cpu: 6
                memory: 30Gi
            volumeMounts:
              - mountPath: /tmp/ray
                name: log-volume # Mount for Ray logs
              - name: persistent-storage # Mount shared filesystem (FSx for Lustre)
                mountPath: /shared
        # Node Selector for Karpenter
        # Karpenter will provision this head pod on a node with the specified labels.
        nodeSelector:
          instanceType: mixed-x86
          provisionerType: Karpenter
        volumes:
          - name: log-volume
            emptyDir: {}
          - name: persistent-storage
            persistentVolumeClaim:
              claimName: fsx-claim # Reference the PVC for shared storage
    rayStartParams:
      dashboard-host: 0.0.0.0 # Make dashboard accessible
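  # Dashboard access (a minimal sketch; it assumes the default KubeRay head Service name
  # <cluster-name>-head-svc, i.e. kuberay-trn1-head-svc, in the default namespace):
  #   kubectl port-forward svc/kuberay-trn1-head-svc 8265:8265 -n default
  #   # then open http://localhost:8265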
  workerGroupSpecs:
    # Worker Node Configuration
    # This section defines the specification for the Ray worker pods.
    # Worker nodes execute tasks and participate in distributed training.
    - groupName: workergroup
      replicas: 2 # Number of worker replicas
      minReplicas: 2 # Minimum number of worker replicas
      maxReplicas: 2 # Maximum number of worker replicas (no scaling in this case)
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-worker
              #image: 111222333444.dkr.ecr.us-west-2.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
              image: <AWS_ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/kuberay_trn1_llama3.1_pytorch2:latest
              imagePullPolicy: Always # Pull the latest image each time
              lifecycle:
                preStop:
                  exec:
                    command: ["/bin/sh", "-c", "ray stop"]
              ports:
                - containerPort: 8265
                  name: dashboard
                - containerPort: 6379
                  name: redis
                - containerPort: 10001
                  name: object-manager
              resources:
                limits:
                  aws.amazon.com/neuron: '16' # Request AWS Neuron cores
                  vpc.amazonaws.com/efa: '8' # Request AWS EFA devices
                  memory: 440Gi
                requests:
                  aws.amazon.com/neuron: '16'
                  vpc.amazonaws.com/efa: '8'
                  cpu: '120'
                  memory: 440Gi
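              # A trn1.32xlarge node provides 16 Neuron devices (32 NeuronCores), 8 EFA interfaces,
              # 128 vCPUs and 512 GiB of memory, so each worker pod effectively reserves a full Trn1 node.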
              volumeMounts:
                - name: persistent-storage
                  mountPath: /shared # Mount shared filesystem (FSx for Lustre)
                - name: dshm
                  mountPath: /dev/shm # Mount for shared memory
                - mountPath: /tmp/ray
                  name: log-volume # Mount for Ray logs
          # Node Selector for Managed Node Group (with Cluster Autoscaler)
          # These workers will run on Trn1 instances provisioned by the cluster autoscaler.
          # This is necessary as Karpenter doesn't currently support EFA (required for Neuron distributed training).
          nodeSelector:
            instance-type: trn1-32xl
            provisioner: cluster-autoscaler
          # Tolerations for Trn1 and Dedicated Nodes
          tolerations:
            - key: "aws.amazon.com/neuron"
              operator: "Exists"
              effect: "NoSchedule"
            - key: "hub.jupyter.org/dedicated"
              operator: "Equal"
              value: "user"
              effect: "NoSchedule"
          volumes:
            # Persistent Volume Claim (PVC) to access the FSx for Lustre filesystem
            - name: persistent-storage
              persistentVolumeClaim:
                claimName: fsx-claim
            - name: dshm
              emptyDir:
                medium: Memory
            - name: log-volume
              emptyDir: {}
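
# ----------------------------------------------------------------------
# Optional post-deployment checks (a minimal sketch; KubeRay labels the cluster's pods with
# ray.io/cluster=<cluster-name>, which is used below to find the head pod):
#   kubectl get raycluster kuberay-trn1 -n default
#   kubectl get pods -n default -l ray.io/cluster=kuberay-trn1
#   kubectl exec -it <head-pod-name> -n default -- ray status
# ----------------------------------------------------------------------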