Commit 316906e (1 parent: aff2a60)
Showing 1 changed file with 241 additions and 0 deletions.
@@ -0,0 +1,241 @@
#!/bin/bash
set -euo pipefail

# Available env vars:
# $TMP_DIR
# $CLUSTER_NAME
# $KUBECONFIG
# $NODE_TERMINATION_HANDLER_DOCKER_REPO
# $NODE_TERMINATION_HANDLER_DOCKER_TAG
# $WEBHOOK_DOCKER_REPO
# $WEBHOOK_DOCKER_TAG
# $AEMM_URL
# $AEMM_VERSION

echo "Starting EC2 State Change SQS Test for Node Termination Handler in SQS mode with Prometheus server enabled"
START_TIME=$(date -u +"%Y-%m-%dT%TZ")
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
PROMETHEUS_HELM_VERSION="41.7.4"

common_helm_args=()

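# Install kube-prometheus-stack with the operator's admission webhooks, Grafana, node-exporter, and kube-state-metrics disabled.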
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
retry 5 helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack --version ${PROMETHEUS_HELM_VERSION} --set prometheusOperator.admissionWebhooks.enabled="false" --set grafana.enabled="false" --set nodeExporter.enabled="false" --set kubeStateMetrics.enabled="false"

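# Deploy LocalStack to mock the EC2 and SQS APIs used by this test.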
localstack_helm_args=(
  upgrade
  --install
  --namespace default
  "$CLUSTER_NAME-localstack"
  "$SCRIPTPATH/../../config/helm/localstack/"
  --set defaultRegion="${AWS_REGION}"
  --wait
)

set -x
helm "${localstack_helm_args[@]}"
set +x

sleep 10

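# Launch a mock EC2 instance in LocalStack (tagged with an ASG name and the NTH managed tag) via a running localstack pod created after this test started.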
RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'"
set -x
localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \
    -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \
    | awk '$2 >= "'"${START_TIME//+0000/Z}"'" { print $1 }')
echo "🥑 Using localstack pod $localstack_pod"
run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "$RUN_INSTANCE_CMD")
instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId')
echo "🥑 Started mock EC2 instance ($instance_id)"

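# Create the SQS queue that NTH will poll for termination events.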
CREATE_SQS_CMD="awslocal sqs create-queue --queue-name "${CLUSTER_NAME}-queue" --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}"
queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "$CREATE_SQS_CMD" | jq -r .QueueUrl)

echo "🥑 Created SQS Queue ${queue_url}"

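# Install aws-node-termination-handler in SQS (queue-processor) mode, pointed at the LocalStack endpoint and queue, with the Prometheus server and PodMonitor enabled.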
anth_helm_args=(
  upgrade
  --install
  --namespace kube-system
  "$CLUSTER_NAME-anth"
  "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
  --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
  --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
  --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
  --set enablePrometheusServer="true"
  --set podMonitor.create="true"
  --set daemonsetTolerations=""
  --set awsAccessKeyID=foo
  --set awsSecretAccessKey=bar
  --set awsRegion="${AWS_REGION}"
  --set awsEndpoint="http://localstack.default"
  --set checkTagBeforeDraining=false
  --set enableSqsTerminationDraining=true
  --set "queueURL=${queue_url}"
  --wait
  --force
)
[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] &&
    anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] &&
    anth_helm_args+=("${common_helm_args[@]}")

set -x
helm "${anth_helm_args[@]}"
set +x

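# Install the webhook test proxy that receives NTH webhook notifications during e2e tests.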
emtp_helm_args=(
  upgrade
  --install
  --namespace default
  "$CLUSTER_NAME-emtp"
  "$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
  --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
  --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
  --wait
)
[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] &&
    emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
[[ ${#common_helm_args[@]} -gt 0 ]] &&
    emtp_helm_args+=("${common_helm_args[@]}")

set -x
helm "${emtp_helm_args[@]}"
set +x

TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

DEPLOYED=0

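# Wait for the regular-pod-test deployment to report no unavailable replicas (up to 10 checks, 5 seconds apart).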
for i in $(seq 1 10); do
    if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
        echo "✅ Verified regular-pod-test pod was scheduled and started!"
        DEPLOYED=1
        break
    fi
    sleep 5
done

if [[ $DEPLOYED -eq 0 ]]; then
    echo "❌ regular-pod-test pod deployment failed"
    fail_and_exit 2
fi

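# Build an EC2 Instance State-change Notification event ("stopping") for the mock instance.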
EC2_STATE_CHANGE_EVENT=$(cat <<EOF
{
  "version": "0",
  "id": "7bf73129-1428-4cd3-a780-95db273d1602",
  "detail-type": "EC2 Instance State-change Notification",
  "source": "aws.ec2",
  "account": "123456789012",
  "time": "$(date -u +"%Y-%m-%dT%TZ")",
  "region": "us-east-1",
  "resources": [
    "arn:aws:ec2:us-east-1:123456789012:instance/${instance_id}"
  ],
  "detail": {
    "instance-id": "${instance_id}",
    "state": "stopping"
  }
}
EOF
)

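# Flatten the event to one line, escape its quotes, and send it to the SQS queue from inside the localstack pod.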
EC2_STATE_CHANGE_EVENT_ONE_LINE=$(echo "${EC2_STATE_CHANGE_EVENT}" | tr -d '\n' | sed 's/\"/\\"/g')
SEND_SQS_CMD="awslocal sqs send-message --queue-url ${queue_url} --message-body \"${EC2_STATE_CHANGE_EVENT_ONE_LINE}\" --region ${AWS_REGION}"
kubectl exec -i "${localstack_pod}" -- bash -c "$SEND_SQS_CMD"
echo "✅ Sent EC2 State Change Event to SQS queue: ${queue_url}"

GET_ATTRS_SQS_CMD="awslocal sqs get-queue-attributes --queue-url ${queue_url} --attribute-names All --region ${AWS_REGION}"

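# Assertion loop: verify the worker node is cordoned, regular-pod-test is evicted, and the SQS message is deleted after processing.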
cordoned=0
evicted=0
test_node="${TEST_NODE:-$CLUSTER_NAME-worker}"
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then
        echo "✅ Verified the worker node was cordoned!"
        cordoned=1
    fi

    if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
        echo "✅ Verified the regular-pod-test pod was evicted!"
        evicted=1
    fi

    if [[ ${evicted} -eq 1 && $(kubectl exec -i "${localstack_pod}" -- bash -c "$GET_ATTRS_SQS_CMD" | jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)') -eq 0 ]]; then
        kubectl exec -i "${localstack_pod}" -- bash -c "$GET_ATTRS_SQS_CMD"
        echo "✅ Verified the message was deleted from the queue after processing!"
        break
    fi

    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done

if [[ $cordoned -eq 0 ]]; then
    echo "❌ Worker node was not cordoned"
    fail_and_exit 3
elif [[ $evicted -eq 0 ]]; then
    echo "❌ regular-pod-test was not evicted"
    fail_and_exit 3
fi

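# Port-forward the NTH pod's Prometheus server (container port 9092) to localhost:7000 so /metrics can be scraped.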
POD_NAME=$(get_nth_worker_pod)
echo "✅ Fetched the pod $POD_NAME"

kubectl -n kube-system port-forward "$POD_NAME" 7000:9092 &
PORT_FORWARD_PID=$!
trap 'kill ${PORT_FORWARD_PID}' EXIT SIGINT SIGTERM ERR
echo "✅ Port-forwarded pod $POD_NAME"

sleep 10

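# Poll /metrics until all expected NTH metric names appear, retrying up to TAINT_CHECK_CYCLES times.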
for i in $(seq 1 $TAINT_CHECK_CYCLES); do
    METRICS_RESPONSE=$(curl -L localhost:7000/metrics)
    echo "✅ Fetched /metrics."
    failed=""
    for METRIC in cordon-and-drain post-drain nth_tagged_instances nth_tagged_nodes runtime_go_gc runtime_go_goroutines runtime_go_mem; do
        if [[ $METRICS_RESPONSE == *"$METRIC"* ]]; then
            echo "✅ Found metric $METRIC"
        else
            echo "⚠️ Metric $METRIC not found yet"
            failed=$METRIC
            break
        fi
    done
    if [[ -z $failed ]]; then
        break
    fi
    echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep $TAINT_CHECK_SLEEP
done

if [[ -n $failed ]]; then
    echo "❌ Metric $failed was not found in the Prometheus server response"
    exit 4
fi

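# Verify the actions_total counters report at least one successful cordon-and-drain and post-drain action.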
metric_name="actions_total"
for action in cordon-and-drain post-drain; do
    labels='node_action="'$action'",node_status="success",otel_scope_name="aws.node.termination.handler",otel_scope_version=""'
    query="$metric_name{$labels}"
    counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
    if (( ${counter_value:-0} < 1 )); then
        echo "❌ Failed counter count for metric action:$action"
        exit 5
    fi
    echo "✅ Fetched counter:$counter_value for metric with action:$action"
done

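# Verify the nth_tagged_instances gauge reports at least one tagged instance.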
for gauge in nth_tagged_instances; do
    query="$gauge"'{otel_scope_name="aws.node.termination.handler",otel_scope_version=""}'
    counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
    if (( ${counter_value:-0} < 1 )); then
        echo "❌ Failed gauge count for metric:$gauge"
        exit 5
    fi
    echo "✅ Fetched gauge:$counter_value for metric:$gauge"
done

exit 0