Skip to content

Commit

Permalink
add e2e test
Browse files Browse the repository at this point in the history
  • Loading branch information
phuhung273 committed Jan 16, 2025
1 parent aff2a60 commit 316906e
Showing 1 changed file with 241 additions and 0 deletions.
241 changes: 241 additions & 0 deletions test/e2e/prometheus-metrics-sqs-test
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
#!/bin/bash
set -euo pipefail

# End-to-end test: runs Node Termination Handler in SQS (queue processor)
# mode with its Prometheus server enabled, sends an EC2 state-change event
# through a LocalStack SQS queue and then checks the exposed /metrics.
#
# Available env vars:
# $TMP_DIR
# $CLUSTER_NAME
# $KUBECONFIG
# $NODE_TERMINATION_HANDLER_DOCKER_REPO
# $NODE_TERMINATION_HANDLER_DOCKER_TAG
# $WEBHOOK_DOCKER_REPO
# $WEBHOOK_DOCKER_TAG
# $AEMM_URL
# $AEMM_VERSION
#
# Also read further down in this script (must be set by the caller):
# $AWS_REGION
# $WORKER_IP
# $IMDS_PORT (only when $INSTANCE_METADATA_URL is unset)
#
# Optional overrides:
# $INSTANCE_METADATA_URL
# $NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY
# $WEBHOOK_DOCKER_PULL_POLICY
# $TEST_NODE
#
# NOTE(review): retry, fail_and_exit and get_nth_worker_pod are called but
# not defined in this file; presumably the test harness exports them.

echo "Starting EC2 State Change SQS Test for Node Termination Handler in SQS mode with Prometheus server enabled"
# Recorded before anything is deployed so that pods created by this run can
# be told apart by their creation timestamp later on.
START_TIME=$(date -u +"%Y-%m-%dT%TZ")
# '&&' instead of ';' so a failed cd cannot silently yield the wrong directory.
SCRIPTPATH="$(cd "$(dirname "$0")" && pwd -P)"
PROMETHEUS_HELM_VERSION="41.7.4"

# Extra helm arguments appended to the NTH and webhook-proxy installs below.
common_helm_args=()

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
# Only the operator and Prometheus itself are needed; the optional chart
# components are switched off.
retry 5 helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
  --version "${PROMETHEUS_HELM_VERSION}" \
  --set prometheusOperator.admissionWebhooks.enabled="false" \
  --set grafana.enabled="false" \
  --set nodeExporter.enabled="false" \
  --set kubeStateMetrics.enabled="false"

# Install (or upgrade) the LocalStack chart into the default namespace.
# The argument list is assembled in three steps: verb, release/chart, values.
localstack_helm_args=(upgrade --install --namespace default)
localstack_helm_args+=(
  "$CLUSTER_NAME-localstack"
  "$SCRIPTPATH/../../config/helm/localstack/"
)
localstack_helm_args+=(--set defaultRegion="${AWS_REGION}" --wait)

# Trace only the helm invocation itself.
set -x
helm "${localstack_helm_args[@]}"
set +x

# Fixed pause after the release reports ready, before LocalStack is used.
sleep 10

# Command executed inside the LocalStack pod to register a mock EC2 instance
# that carries the worker's private IP and the tags used by this test.
RUN_INSTANCE_CMD="awslocal ec2 run-instances --private-ip-address ${WORKER_IP} --region ${AWS_REGION} --tag-specifications 'ResourceType=instance,Tags=[{Key=aws:autoscaling:groupName,Value=nth-integ-test},{Key=aws-node-termination-handler/managed,Value=blah}]'"
# NOTE(review): tracing is switched on here and stays on until the next
# 'set +x' later in the script.
set -x
# Pick the running LocalStack pod created at or after START_TIME. The
# timestamp is handed to awk through -v rather than spliced into the program.
localstack_pod=$(kubectl get pods --selector app=localstack --field-selector="status.phase=Running" \
  -o go-template --template '{{range .items}}{{.metadata.name}} {{.metadata.creationTimestamp}}{{"\n"}}{{end}}' \
  | awk -v started="${START_TIME//+0000/Z}" '$2 >= started { print $1 }')
echo "🥑 Using localstack pod $localstack_pod"
run_instances_resp=$(kubectl exec -i "${localstack_pod}" -- bash -c "$RUN_INSTANCE_CMD")
instance_id=$(echo "${run_instances_resp}" | jq -r '.Instances[] .InstanceId')
echo "🥑 Started mock EC2 instance ($instance_id)"

# The queue name is single-quoted for the shell inside the pod. The previous
# form closed and reopened the outer double quotes around the expansion,
# which left ${CLUSTER_NAME} unquoted in this shell.
CREATE_SQS_CMD="awslocal sqs create-queue --queue-name '${CLUSTER_NAME}-queue' --attributes MessageRetentionPeriod=300 --region ${AWS_REGION}"
queue_url=$(kubectl exec -i "${localstack_pod}" -- bash -c "$CREATE_SQS_CMD" | jq -r .QueueUrl)

echo "🥑 Created SQS Queue ${queue_url}"

# Install (or upgrade) Node Termination Handler in kube-system, pointed at
# LocalStack and the queue created above, with the Prometheus server on.
anth_helm_args=(
  upgrade
  --install
  --namespace kube-system
  # release name and chart path
  "$CLUSTER_NAME-anth"
  "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
  # image and metadata endpoint
  --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
  --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
  --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
  # metrics
  --set enablePrometheusServer="true"
  --set podMonitor.create="true"
  --set daemonsetTolerations=""
  # dummy AWS credentials and the LocalStack endpoint
  --set awsAccessKeyID=foo
  --set awsSecretAccessKey=bar
  --set awsRegion="${AWS_REGION}"
  --set awsEndpoint="http://localstack.default"
  # SQS termination draining
  --set checkTagBeforeDraining=false
  --set enableSqsTerminationDraining=true
  --set "queueURL=${queue_url}"
  --wait
  --force
)

# Optional pull-policy override; the variable may be unset.
if [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]]; then
  anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
fi

# Append shared helm arguments only when there are any.
if ((${#common_helm_args[@]} > 0)); then
  anth_helm_args+=("${common_helm_args[@]}")
fi

set -x
helm "${anth_helm_args[@]}"
set +x

# Install (or upgrade) the webhook-test-proxy chart in the default namespace.
emtp_helm_args=(
  upgrade
  --install
  --namespace default
  # release name and chart path
  "$CLUSTER_NAME-emtp"
  "$SCRIPTPATH/../../config/helm/webhook-test-proxy/"
  # image under test
  --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO"
  --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG"
  --wait
)

# Optional pull-policy override; the variable may be unset.
if [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]]; then
  emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY")
fi

# Append shared helm arguments only when there are any.
if ((${#common_helm_args[@]} > 0)); then
  emtp_helm_args+=("${common_helm_args[@]}")
fi

set -x
helm "${emtp_helm_args[@]}"
set +x

# Number of polling iterations and the pause between them, used by the
# assertion and metrics loops later in the script.
TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

DEPLOYED=0

# Poll up to 10 times, 5 seconds apart, until regular-pod-test reports no
# unavailable replicas.
for ((i = 1; i <= 10; i++)); do
  # The jsonpath result is empty when the field is absent, which is treated
  # as zero. The kubectl exit status is checked separately so that a failed
  # query (which also produces empty output) is not mistaken for "ready".
  if unavailable_replicas=$(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') \
    && [[ "${unavailable_replicas:-0}" -eq 0 ]]; then
    echo "✅ Verified regular-pod-test pod was scheduled and started!"
    DEPLOYED=1
    break
  fi
  sleep 5
done

if [[ $DEPLOYED -eq 0 ]]; then
  echo "❌ regular-pod-test pod deployment failed"
  fail_and_exit 2
fi


# EC2 "stopping" state-change notification for the mock instance; the
# timestamp and the instance id are filled in when the heredoc is expanded.
EC2_STATE_CHANGE_EVENT=$(cat <<EOF
{
"version": "0",
"id": "7bf73129-1428-4cd3-a780-95db273d1602",
"detail-type": "EC2 Instance State-change Notification",
"source": "aws.ec2",
"account": "123456789012",
"time": "$(date -u +"%Y-%m-%dT%TZ")",
"region": "us-east-1",
"resources": [
"arn:aws:ec2:us-east-1:123456789012:instance/${instance_id}"
],
"detail": {
"instance-id": "${instance_id}",
"state": "stopping"
}
}
EOF
)

# Collapse the document onto a single line and backslash-escape every double
# quote, so it can sit inside the double-quoted --message-body argument of
# the command string that is run by bash -c inside the pod.
EC2_STATE_CHANGE_EVENT_ONE_LINE=$(
  tr -d '\n' <<<"${EC2_STATE_CHANGE_EVENT}" \
    | sed 's/\"/\\"/g'
)
SEND_SQS_CMD="awslocal sqs send-message --queue-url ${queue_url} --message-body \"${EC2_STATE_CHANGE_EVENT_ONE_LINE}\" --region ${AWS_REGION}"
kubectl exec -i "${localstack_pod}" -- bash -c "$SEND_SQS_CMD"
printf '%s\n' "✅ Sent EC2 State Change Event to SQS queue: ${queue_url}"

# Command executed inside the LocalStack pod to read all queue attributes.
GET_ATTRS_SQS_CMD="awslocal sqs get-queue-attributes --queue-url ${queue_url} --attribute-names All --region ${AWS_REGION}"

# Each flag latches to 1 once its condition has been observed; the checks
# build on each other (cordon -> evict -> message deleted).
cordoned=0
evicted=0
message_deleted=0
test_node="${TEST_NODE:-$CLUSTER_NAME-worker}"
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
  # Redirecting to /dev/null (rather than grep -q) lets grep consume all of
  # kubectl's output, so pipefail cannot trip on an early-closed pipe.
  if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then
    echo "✅ Verified the worker node was cordoned!"
    cordoned=1
  fi

  if [[ $cordoned -eq 1 && $evicted -eq 0 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
    echo "✅ Verified the regular-pod-test pod was evicted!"
    evicted=1
  fi

  if [[ $evicted -eq 1 ]]; then
    # Visible plus in-flight messages must add up to zero. The exit status of
    # kubectl and jq is checked so that a failed query (empty output) is not
    # read as an empty queue.
    if queue_attrs=$(kubectl exec -i "${localstack_pod}" -- bash -c "$GET_ATTRS_SQS_CMD") \
      && pending_messages=$(jq '(.Attributes.ApproximateNumberOfMessagesNotVisible|tonumber) + (.Attributes.ApproximateNumberOfMessages|tonumber)' <<<"$queue_attrs") \
      && [[ -n "$pending_messages" && "$pending_messages" -eq 0 ]]; then
      echo "$queue_attrs"
      echo "✅ Verified the message was deleted from the queue after processing!"
      message_deleted=1
      # All three conditions are met; stop polling and move on.
      break
    fi
  fi

  echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
  sleep "$TAINT_CHECK_SLEEP"
done

# Report the first condition that was never observed and abort. Previously a
# failure message was printed here even after a fully successful loop, and a
# real failure did not stop the script.
if [[ $message_deleted -eq 0 ]]; then
  if [[ $cordoned -eq 0 ]]; then
    echo "❌ Worker node was not cordoned"
  elif [[ $evicted -eq 0 ]]; then
    echo "❌ regular-pod-test was not evicted"
  else
    echo "❌ Message was not deleted from the queue after processing"
  fi
  # NOTE(review): 3 is chosen because 2, 4 and 5 are used by the other
  # failure exits in this script; adjust if the harness expects another code.
  fail_and_exit 3
fi


# Forward local port 7000 to port 9092 of the NTH pod for the /metrics checks.
POD_NAME=$(get_nth_worker_pod)
echo "✅ Fetched the pod $POD_NAME "

kubectl -n kube-system port-forward "$POD_NAME" 7000:9092 &
PORT_FORWARD_PID=$!

# Stops the background port-forward. Safe to call more than once and when
# the process is already gone, so it never masks the script's own exit status.
cleanup_port_forward() {
  kill "${PORT_FORWARD_PID}" 2>/dev/null || true
}
# EXIT covers normal exits and exits caused by 'set -e'. The signal traps
# turn INT/TERM into a real exit (which then runs the EXIT trap) instead of
# letting the script resume after the handler returns.
trap cleanup_port_forward EXIT
trap 'exit 130' SIGINT
trap 'exit 143' SIGTERM
echo "✅ Port-forwarded pod $POD_NAME"

# Fixed pause before the first request to the forwarded port.
sleep 10

# Poll /metrics until every expected metric name appears in the response.
# METRICS_RESPONSE keeps the last fetched body for the value checks below;
# 'failed' names the first missing item of the last attempt (empty = all ok).
METRICS_RESPONSE=""
failed=""
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
  failed=""
  # curl runs as an 'if' condition so a refused or reset connection leads to
  # a retry instead of aborting the script under 'set -e'.
  if METRICS_RESPONSE=$(curl -L localhost:7000/metrics); then
    echo "✅ Fetched /metrics."
    for METRIC in cordon-and-drain post-drain nth_tagged_instances nth_tagged_nodes runtime_go_gc runtime_go_goroutines runtime_go_mem; do
      if [[ $METRICS_RESPONSE == *"$METRIC"* ]]; then
        echo "✅ Metric $METRIC!"
      else
        echo "⚠️ Metric $METRIC"
        failed=$METRIC
        break
      fi
    done
  else
    echo "⚠️ Could not fetch /metrics"
    failed="/metrics"
  fi
  if [[ -z "$failed" ]]; then
    break
  fi
  echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
  sleep "$TAINT_CHECK_SLEEP"
done

if [[ -n "$failed" ]]; then
  echo "❌ Metrics check failed, missing: $failed"
  exit 4
fi

#######################################
# Looks up one sample in the fetched metrics text.
# Globals:   METRICS_RESPONSE (read)
# Arguments: $1 - literal series text, i.e. metric name plus label set
# Outputs:   the last field of the first line containing $1; nothing when
#            no line contains it
# Returns:   awk's exit status (0 also when nothing matched)
#######################################
metric_sample_value() {
  # The series text is matched literally via index(), so '{', '}' and '.'
  # are not interpreted as regex syntax; it is passed through the
  # environment so awk does no escape processing on it.
  series="$1" awk 'index($0, ENVIRON["series"]) { print $NF; exit }' <<<"$METRICS_RESPONSE"
}

# Both drain actions must have been counted as successful at least once.
metric_name="actions_total"
for action in cordon-and-drain post-drain; do
  labels='node_action="'$action'",node_status="success",otel_scope_name="aws.node.termination.handler",otel_scope_version=""'
  query="$metric_name{$labels}"
  counter_value=$(metric_sample_value "$query")
  # A missing or non-integer value is a failure too; previously a missing
  # series ended the script through 'set -e' before this message was reached.
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if [[ ! "$counter_value" =~ ^[0-9]+$ ]] || ((10#$counter_value < 1)); then
    echo "❌ Failed counter count for metric action:$action"
    exit 5
  fi
  echo "✅ Fetched counter:$counter_value for metric with action:$action"
done

# The tagged-instances gauge must report at least one instance.
for gauge in nth_tagged_instances; do
  query="${gauge}"'{otel_scope_name="aws.node.termination.handler",otel_scope_version=""}'
  counter_value=$(metric_sample_value "$query")
  if [[ ! "$counter_value" =~ ^[0-9]+$ ]] || ((10#$counter_value < 1)); then
    echo "❌ Failed gauge count for metric:$gauge"
    exit 5
  fi
  echo "✅ Fetched gauge:$counter_value for metric:$gauge"
done


exit 0

0 comments on commit 316906e

Please sign in to comment.