From 004cbf99bd447ff3859b86ae4af1a3a2454c642d Mon Sep 17 00:00:00 2001
From: Farley
Date: Mon, 24 Jan 2022 19:35:09 +1300
Subject: [PATCH] Sending Kubernetes events on modification

---
 README.md  | 22 ++++++++-------
 helpers.py | 82 +++++++++++++++++++++++++++++++++++++++++++++---------
 main.py    | 44 ++++++++++++++++++-----------
 3 files changed, 108 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index f2941e0..df5481a 100644
--- a/README.md
+++ b/README.md
@@ -190,8 +190,9 @@ spec:
 ### [Release: 1.0.3 - Jan 23, 2022](https://github.com/DevOps-Nirvana/Kubernetes-Volume-Autoscaler/releases/tag/1.0.3)
 ```
 Handle signal from Kubernetes to kill/restart properly/quickly
-Add full env vars as documentation markdown table, inside notes for development below
+Add full env vars as documentation markdown table, inside notes for development section below
 Adding better exception logs via traceback, and more readable/reasonable log output especially when VERBOSE is enabled
+Generate Kubernetes events so everyone viewing the event stream knows when actions occur. AKA, be a responsible controller
 ```

 ### [Release: 1.0.2 - Jan 15, 2022](https://github.com/DevOps-Nirvana/Kubernetes-Volume-Autoscaler/releases/tag/1.0.2)
@@ -217,21 +218,22 @@ Some basic documentation and installation help/guides

 Current Release: 1.0.2

-This todo list is mostly for the Author(s), but any contributions are also welcome. Please [submit an Issue](https://github.com/DevOps-Nirvana/Kubernetes-Volume-Autoscaler/issues) for issues or requests, or an [Pull Request](https://github.com/DevOps-Nirvana/Kubernetes-Volume-Autoscaler/pulls) if you added some code.
+This todo list is mostly for the Author(s), but any contributions are also welcome. Please [submit an Issue](https://github.com/DevOps-Nirvana/Kubernetes-Volume-Autoscaler/issues) for issues/requests, or a [Pull Request](https://github.com/DevOps-Nirvana/Kubernetes-Volume-Autoscaler/pulls) if you added some code.

 These items are generally put in order of most-important first.
-* Add full helm chart values documentation markdown table
-* Push to helm repo in a Github Action and push the static yaml as well
+* Listen/watch to events of the PV/PVC, or listen/read from Prometheus, to monitor and ensure the resizing happens; log and/or Slack it accordingly
+* Catch WHY resizing failed (try?) and make sure to log/send the why to Slack and Kubernetes events
+* Check if the storage class has ALLOWVOLUMEEXPANSION set, to (help?) ensure the expansion will succeed; do something about it and/or report it
+* Add full helm chart values documentation markdown table (tie into adding docs for Universal Helm Charts)
+* Push to helm repo in a Github Action and push the static yaml as well, to make things "easier" and automated
+* Add badges to the README regarding Github Actions success/failures
 * Add tests coverage to ensure the software works as intended moving forward
 * Do some load testing to see how well this software deals with scale, document how much resources needed for each interval. (10 PVCs, 100+ PVC, 500 PVC)
-* Generate kubernetes EVENTS when we resize volumes so everyone knows we are doing things, to be a good controller
-* Add badges to the README regarding Github Actions success/failures
-* Listen/watch to events of the PV/PVC, or listen/read from Prometheus to monitor and ensure the resizing happens, log and/or slack it accordingly
-* Test it and add working examples of using this on other cloud providers (Azure / Google Cloud)
-* Make per-PVC annotations to (re)direct Slack to different webhooks and/or different channel(s)
+* Make per-PVC annotations to (re)direct Slack to different webhooks and/or different channel(s)?
 * Discuss what the ideal "default" amount of time before scaling. Currently is 5 minutes (5, 60 minute intervals)
 * Discuss what the ideal "default" scale up size is, currently 50%. Suggestion has been made to lower this to around 20%
+* Add better examples for "best practices" when using Helm (aka, subchart)
+* Test it and add working examples of using this on other cloud providers (Azure / Google Cloud)
 * Auto-detect (or let user choose) a different provider (eg: AWS/Google) and set different per-provider defaults (eg: wait time, min/max disk size, min disk increment, etc)
-* Check if storage class has the ALLOWVOLUMEEXPANSION to (help?) ensure the expansion will succeed

 # Notes for Development
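The release notes and the first TODO item above both concern the Kubernetes events this patch begins emitting. As a point of reference (not part of the patch itself), those events can be read back with the official Python client. A minimal sketch, assuming auth is already configured; `default` and `my-pvc` are placeholder names:

```python
# Minimal sketch: list the events the autoscaler emits for one PVC.
from kubernetes import client, config

config.load_kube_config()  # use config.load_incluster_config() inside a pod
core_api = client.CoreV1Api()

events = core_api.list_namespaced_event(
    namespace="default",
    field_selector="involvedObject.kind=PersistentVolumeClaim,involvedObject.name=my-pvc",
)
for event in events.items:
    # This patch emits reason=VolumeResizeRequested (Normal) and
    # reason=VolumeResizeRequestFailed (Warning)
    print(event.first_timestamp, event.type, event.reason, event.message)
```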
diff --git a/helpers.py b/helpers.py
index b7df3f5..16aef99 100644
--- a/helpers.py
+++ b/helpers.py
@@ -1,9 +1,13 @@
 from os import getenv # Environment variable handling
-import time # Sleep
+import time # Sleep/time
+import datetime
 import requests # For making HTTP requests to Prometheus
 import kubernetes # For talking to the Kubernetes API
+from kubernetes.client import ApiException
 from packaging import version # For checking if prometheus version is new enough to use a new function present_over_time()
 import signal # For sigkill handling
+import random # Random string generation
+import traceback # Debugging/trace outputs

 # Used below in init variables
 def detectPrometheusURL():
@@ -12,7 +16,7 @@ def detectPrometheusURL():
     prometheus_ip_address = getenv('PROMETHEUS_SERVER_SERVICE_HOST')
     prometheus_port = getenv('PROMETHEUS_SERVER_SERVICE_PORT_HTTP')

-    # TODO: If there's other ways to also detect prometheus (eg: try http://prometheus-server) please put them here...
+    # TODO: If there are other ways to also detect prometheus (eg: try http://prometheus-server), please put them here...?

     if not prometheus_ip_address or not prometheus_port:
         print("ERROR: PROMETHEUS_URL was not set, and can not auto-detect where prometheus is")
@@ -35,7 +39,7 @@ def detectPrometheusURL():
 PROMETHEUS_VERSION = "Unknown" # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user
 VERBOSE = True if getenv('VERBOSE', "false").lower() == "true" else False # If we want to verbose mode

-# This handler helps handle sigkill gracefully
+# This handler helps handle SIGINT/SIGTERM gracefully (not in the middle of a runloop)
 class GracefulKiller:
     kill_now = False
     def __init__(self):
@@ -69,7 +73,7 @@ def printHeaderAndConfiguration():
     print("   Volume Autoscaler - Configuration   ")
     print("---------------------------------------------------------------")
     print(" Prometheus URL: {}".format(PROMETHEUS_URL))
-    print(" Prometheus Version: {}".format(PROMETHEUS_VERSION))
+    print(" Prometheus Version: {}{}".format(PROMETHEUS_VERSION, " (upgrade to >= 2.30.0 to prevent some false positives)" if version.parse(PROMETHEUS_VERSION) < version.parse("2.30.0") else ""))
     print(" Prometheus Labels: {{{}}}".format(PROMETHEUS_LABEL_MATCH))
     print(" Interval to query usage: every {} seconds".format(INTERVAL_TIME))
     print(" Scale up after: {} intervals ({} seconds total)".format(SCALE_AFTER_INTERVALS, SCALE_AFTER_INTERVALS * INTERVAL_TIME))
@@ -247,6 +251,14 @@ def convert_pvc_to_simpler_dict(pvc):
         return_dict['storage_class'] = pvc.spec.storage_class_name
     except:
         return_dict['storage_class'] = ""
+    try:
+        return_dict['resource_version'] = pvc.metadata.resource_version
+    except:
+        return_dict['resource_version'] = ""
+    try:
+        return_dict['uid'] = pvc.metadata.uid
+    except:
+        return_dict['uid'] = ""

     # Set our defaults
     return_dict['last_resized_at'] = 0
@@ -324,6 +336,7 @@ def scale_up_pvc(namespace, name, new_size):
         print(e)
         return False

+
 # Test if prometheus is accessible, and gets the build version so we know which function(s) are available or not, primarily for present_over_time below
 def testIfPrometheusIsAccessible(url):
     global PROMETHEUS_VERSION
@@ -337,6 +350,7 @@ def testIfPrometheusIsAccessible(url):
         print(e)
         exit(-1)

+
 # Get a list of PVCs from Prometheus with their metrics of disk usage
 def fetch_pvcs_from_prometheus(url, label_match=PROMETHEUS_LABEL_MATCH):
@@ -357,15 +371,57 @@ def fetch_pvcs_from_prometheus(url, label_match=PROMETHEUS_LABEL_MATCH):

     return response_object['data']['result']

+# Describe a specific PVC
+def describe_pvc(namespace, name, simple=False):
+    api_response = kubernetes_core_api.list_namespaced_persistent_volume_claim(namespace, limit=1, field_selector="metadata.name=" + name, timeout_seconds=HTTP_TIMEOUT)
+    # print(api_response)
+    for item in api_response.items:
+        # If the user wants pre-parsed, making it a bit easier to work with than a huge map of map of maps
+        if simple:
+            return convert_pvc_to_simpler_dict(item)
+        return item
+    raise Exception("No PVC found for {}:{}".format(namespace, name))
+
+# Convert a PVC to an involved object for Kubernetes events
+def get_involved_object_from_pvc(pvc):
+    return kubernetes.client.V1ObjectReference(
+        api_version="v1",
+        kind="PersistentVolumeClaim",
+        name=pvc.metadata.name,
+        namespace=pvc.metadata.namespace,
+        resource_version=pvc.metadata.resource_version,
+        uid=pvc.metadata.uid,
+    )
+
+# Send events to Kubernetes; used when we modify PVCs
+def send_kubernetes_event(namespace, name, reason, message, type="Normal"):

-# Unused at the moment
-# def describe_pvc(namespace, name, simple=False):
-#     api_response = kubernetes_core_api.list_namespaced_persistent_volume_claim(namespace, limit=1, field_selector="metadata.name=" + name, timeout_seconds=HTTP_TIMEOUT)
-#     # print(api_response)
-#     for item in api_response.items:
-#         # If the user wants pre-parsed, making it a bit easier to work with than a huge map of map of maps
-#         if simple:
-#             return convert_pvc_to_simpler_dict(item)
-#         return item
+    try:
+        # Lookup our PVC
+        pvc = describe_pvc(namespace, name)
+
+        # Generate our metadata and object relation for this event
+        involved_object = get_involved_object_from_pvc(pvc)
+        source = kubernetes.client.V1EventSource(component="volume-autoscaler")
+        metadata = kubernetes.client.V1ObjectMeta(
+            namespace=namespace,
+            name=name + ''.join([random.choice('123456789abcdef') for n in range(16)]),
+        )
+
+        # Generate our event body with the reason and message set
+        body = kubernetes.client.CoreV1Event(
+            involved_object=involved_object,
+            metadata=metadata,
+            reason=reason,
+            message=message,
+            type=type,
+            source=source,
+            first_timestamp=datetime.datetime.now(datetime.timezone.utc).isoformat()
+        )
+
+        api_response = kubernetes_core_api.create_namespaced_event(namespace, body, field_manager="volume_autoscaler")
+    except ApiException as e:
+        print("Exception when calling CoreV1Api->create_namespaced_event: %s\n" % e)
+    except:
+        traceback.print_exc()
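To make the contract of the new helper concrete before it is wired into main.py below, here is a minimal usage sketch (not part of the patch; the namespace and PVC name are placeholders, and helpers.py is assumed to have already initialized its Kubernetes client):

```python
from helpers import send_kubernetes_event

# Looks up the PVC "default/my-pvc", attaches it as the event's involved
# object, and creates a "Normal" event; main.py passes type="Warning" on
# failures. API errors are caught and logged inside the helper, not raised.
send_kubernetes_event(
    namespace="default",
    name="my-pvc",
    reason="VolumeResizeRequested",
    message="Requesting to scale up `default.my-pvc` from `10G` to `15G`",
)
```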
diff --git a/main.py b/main.py
index f557f1e..50da7ca 100755
--- a/main.py
+++ b/main.py
@@ -2,7 +2,7 @@
 import os
 import time
 from helpers import INTERVAL_TIME, PROMETHEUS_URL, DRY_RUN, VERBOSE
-from helpers import convert_bytes_to_storage, scale_up_pvc, testIfPrometheusIsAccessible, describe_all_pvcs
+from helpers import convert_bytes_to_storage, scale_up_pvc, testIfPrometheusIsAccessible, describe_all_pvcs, send_kubernetes_event
 from helpers import fetch_pvcs_from_prometheus, printHeaderAndConfiguration, calculateBytesToScaleTo, GracefulKiller
 import slack
 import sys, traceback
@@ -143,28 +143,38 @@
             # If we aren't dry-run, lets resize
             print("  RESIZING disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
+            status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format(
+                volume_description,
+                pvcs_in_kubernetes[volume_description]['scale_up_percent'],
+                convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),
+                convert_bytes_to_storage(resize_to_bytes),
+                pvcs_in_kubernetes[volume_description]['scale_above_percent'],
+                IN_MEMORY_STORAGE[volume_description] * INTERVAL_TIME
+            )
+            # Send an event that we're starting to request a resize
+            send_kubernetes_event(
+                name=volume_name, namespace=volume_namespace, reason="VolumeResizeRequested",
+                message="Requesting {}".format(status_output)
+            )
+
             if scale_up_pvc(volume_namespace, volume_name, resize_to_bytes):
-                status_output = "Successfully scaled up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format(
-                    volume_description,
-                    pvcs_in_kubernetes[volume_description]['scale_up_percent'],
-                    convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),
-                    convert_bytes_to_storage(resize_to_bytes),
-                    pvcs_in_kubernetes[volume_description]['scale_above_percent'],
-                    IN_MEMORY_STORAGE[volume_description] * INTERVAL_TIME
-                )
+                status_output = "Successfully requested {}".format(status_output)
+                # Print success to console
                 print(status_output)
+                # Intentionally skip sending an event to Kubernetes on success; the request event above is enough for now, until we can detect whether the resize actually succeeded
+                # Print success to Slack
                 if slack.SLACK_WEBHOOK_URL and len(slack.SLACK_WEBHOOK_URL) > 0:
                     slack.send(status_output)
             else:
-                status_output = "FAILED Scaling up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format(
-                    volume_description,
-                    pvcs_in_kubernetes[volume_description]['scale_up_percent'],
-                    convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),
-                    convert_bytes_to_storage(resize_to_bytes),
-                    pvcs_in_kubernetes[volume_description]['scale_above_percent'],
-                    IN_MEMORY_STORAGE[volume_description] * INTERVAL_TIME,
-                )
+                status_output = "FAILED requesting {}".format(status_output)
+                # Print failure to console
                 print(status_output)
+                # Send failure to Kubernetes events
+                send_kubernetes_event(
+                    name=volume_name, namespace=volume_namespace, reason="VolumeResizeRequestFailed",
+                    message=status_output, type="Warning"
+                )
+                # Print failure to Slack
-                if SLACK_WEBHOOK_URL and len(SLACK_WEBHOOK_URL) > 0:
+                if slack.SLACK_WEBHOOK_URL and len(slack.SLACK_WEBHOOK_URL) > 0:
                     slack.send(status_output, severity="error")
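Finally, the TODO above already names the natural follow-up: listen/watch PV/PVC events to confirm a resize actually completes, since this patch only records that a resize was requested. A hypothetical sketch of that follow-up (not part of this patch) using the Python client's watch API, with placeholder names:

```python
# Hypothetical follow-up sketch: watch PVC events to observe resize progress.
# Assumes kubeconfig auth; "default" is a placeholder namespace.
from kubernetes import client, config, watch

config.load_kube_config()
core_api = client.CoreV1Api()

for item in watch.Watch().stream(
    core_api.list_namespaced_event,
    namespace="default",
    field_selector="involvedObject.kind=PersistentVolumeClaim",
    timeout_seconds=60,
):
    event = item["object"]
    # The storage stack emits reasons such as "Resizing" and
    # "FileSystemResizeSuccessful" that could confirm completion
    print(event.reason, event.involved_object.name, event.message)
```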