From abfb6fe1f302dd9bf659ca9a6142fcfa110d9ed4 Mon Sep 17 00:00:00 2001 From: Dmytro Date: Fri, 14 Feb 2025 12:48:33 -0700 Subject: [PATCH] Add readme --- README.md | 155 +++++++++++++++++- ...teresisevaluation.yaml => evaluation.yaml} | 0 .../kustomization.yaml | 15 +- ...tedpnormloadindex.yaml => load-index.yaml} | 0 ...scalingnormalizer.yaml => normalizer.yaml} | 0 ...ssingtimepartition.yaml => partition.yaml} | 0 ...v1alpha1_prometheuspoll.yaml => poll.yaml} | 0 ...pha1_replicasetscaler.yaml => scaler.yaml} | 2 +- ...ershardmanager.yaml => shard-manager.yaml} | 2 +- ...autoscaler.json => argocd-autoscaler.json} | 0 10 files changed, 162 insertions(+), 12 deletions(-) rename config/default-scaling-strategy/{autoscaler_v1alpha1_mostwantedtwophasehysteresisevaluation.yaml => evaluation.yaml} (100%) rename config/default-scaling-strategy/{autoscaler_v1alpha1_weightedpnormloadindex.yaml => load-index.yaml} (100%) rename config/default-scaling-strategy/{autoscaler_v1alpha1_robustscalingnormalizer.yaml => normalizer.yaml} (100%) rename config/default-scaling-strategy/{autoscaler_v1alpha1_longestprocessingtimepartition.yaml => partition.yaml} (100%) rename config/default-scaling-strategy/{autoscaler_v1alpha1_prometheuspoll.yaml => poll.yaml} (100%) rename config/default-scaling-strategy/{autoscaler_v1alpha1_replicasetscaler.yaml => scaler.yaml} (92%) rename config/default-scaling-strategy/{autoscaler_v1alpha1_secrettypeclustershardmanager.yaml => shard-manager.yaml} (77%) rename grafana/{cluster-autoscaler.json => argocd-autoscaler.json} (100%) diff --git a/README.md b/README.md index 3972fc0..53871a8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,155 @@ # argocd-autoscaler -Autoscaling ArgoCD -This is work in progress.. check out design doc at [DESIGN.md](./DESIGN.md). 
+This controller can automatically partition shards (destination kubernetes clusters) to ArgoCD Application Controllers, +determine how many App Controller replicas are needed for that partitioning, and scale App Controller accordingly. + +There are three levels of detail at which I can explain how this works and how to use it. + +## TL;DR + +Level one aka TL;DR version: it will look at prometheus metrics from App Controllers, +determine load index of each destination cluster, +and partition clusters to replicas in the most efficient way. +And, scale the App Controller accordingly too. You can install it using kustomize in three simple steps: + +1. Grab CRDs from [./config/crd](./config/crd) +1. Grab autoscaler from [./config/default](./config/default) +1. Grab our default opinionated scaling strategy from [./config/default-scaling-strategy](./config/default-scaling-strategy) + +Sample `kustomization.yaml` that you may want to use: + +```yaml +namespace: argocd +resources: +- github.com/plumber-cd/argocd-autoscaler/config/crd?ref=vX.X.X +- github.com/plumber-cd/argocd-autoscaler/config/default?ref=vX.X.X +- github.com/plumber-cd/argocd-autoscaler/config/default-scaling-strategy?ref=vX.X.X +patches: +- target: + kind: Deployment + name: argocd-autoscaler + path: argocd-application-controller-remove-replicas-patch.yaml + patch: |- + - op: add + path: /spec/template/spec/containers/0/resources + value: + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 128Mi +``` + +## Advanced configuration + +### Customize scaling strategy + +You can use [./config/default-scaling-strategy](./config/default-scaling-strategy) as a starting point. +The things you will likely want to customize are `poll.yaml` and `load-index.yaml` and `evaluation.yaml`. + +The `poll.yaml` uses `PrometheusPoll` and controls what queries are made to Prometheus. +Based on your preference, you may adjust what works for you. +Note that a query must return a single value. 
+ +The `load-index.yaml` uses `WeightedPNormLoadIndex` to calculate load index using normalized polling values with weights. +Here, you may want to adjust how much each individual metric contributes to the result. + +The `evaluation.yaml` uses `MostWantedTwoPhaseHysteresisEvaluation` to observe and promote partitioning. +Here, you may want to customize how long it should observe before electing and applying a re-shuffle. + +Customization recommendations based on defaults are as follows. + +Generally speaking, you may want to follow the default opinionated `quantile_over_time` sampling and maybe only modify +which quantiles are sampled. +Otherwise - you want to keep in mind to keep sampling at a high enough quantile and for longer ranges. +This helps to "remember" spikes for longer, +and have them reflected in the load index later. +You will receive a mostly flat load index, and that's what you are aiming for. +Otherwise, spikes will not inform the overall partitioning decision in the evaluation phase, +and (provided that at idle all shards are at about the same rate of utilization overall) - you will most likely +end up with partitioning of one shard per one replica. + +It may be tempting to remove the evaluation piece and use the load index directly at the scaling, +if you want to make scaling more reactive and instant. +But doing that may result in App Controller restarts every poll cycle. + +There is a middle ground which is to still use evaluation, +use `max` over a minimal possible polling period to get the latest values in queries, +and dial down the evaluation stabilization period to a much shorter value, such that you are comfortable with how often it can restart. + +You can use the [./grafana/argocd-autoscaler.json](./grafana/argocd-autoscaler.json) dashboard to aid you in figuring out optimal values for you. +You can deploy everything but `scaler.yaml`, which will essentially make it a dry-run. 
+You'll see in Grafana what it would want to do under the current scaling strategy without it actually doing anything. +For that, of course, you'd need to deploy a `ServiceMonitor` (see below). + + +### Monitor Autoscaler with Prometheus + +Instead of `github.com/plumber-cd/argocd-autoscaler/config/default` +you can use `github.com/plumber-cd/argocd-autoscaler/config/default-with-monitoring`. +That deploys an additional metrics service and a service monitor. +Here's a sample `kustomization.yaml` that you may want to use: + +```yaml +namespace: argocd +resources: +- github.com/plumber-cd/argocd-autoscaler/config/crd?ref=vX.X.X +- github.com/plumber-cd/argocd-autoscaler/config/default-with-monitoring?ref=vX.X.X +- github.com/plumber-cd/argocd-autoscaler/config/default-scaling-strategy?ref=vX.X.X +patches: +- target: + kind: Deployment + name: argocd-autoscaler + path: argocd-application-controller-remove-replicas-patch.yaml + patch: |- + - op: add + path: /spec/template/spec/containers/0/resources + value: + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 128Mi +- target: + kind: NetworkPolicy + name: argocd-autoscaler-allow-metrics-traffic + patch: |- + - op: replace + path: /spec/ingress/0/from/0/namespaceSelector/matchLabels + value: + kubernetes.io/metadata.name: +- target: + kind: ServiceMonitor + name: argocd-autoscaler + path: argocd-autoscaler-metrics-network-policy-patch.yaml + patch: |- + - op: add + path: /metadata/labels + value: + : +``` + +At [./grafana/argocd-autoscaler.json](./grafana/argocd-autoscaler.json) you may find a dashboard that I am using. + +There are also other dashboards generated by `kubebuilder` - the runtime telemetry one is good, but the custom metrics one is useless. + +### Secured Autoscaler with Prometheus + +Generally speaking, for securing communications you may want to use the following: + +1. Network Policy that only allows Prometheus to access the `/metrics` endpoint. +1. Put the `/metrics` endpoint behind authentication. +1. 
Apply TLS encryption for `/metrics` endpoint. + +Network policy is already included in the previous example. For the rest - you do need additional RBAC and CertManager. + +Once you make sure these prerequisites are deployed, you may want to use [./config/default-secured](./config/default-secured). + +## Hardcore + +I have a pretty detailed document at [./DESIGN.md](./DESIGN.md) that explains all the implementation details. +It can help you to customize everything and anything or hopefully even contribute more ideas that we can implement. +At the current stage of the project, we really aim to provide a framework rather than an end-result solution. +Although we do include a default opinionated scaling strategy - only you can define a strategy that would work best for yourself. diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_mostwantedtwophasehysteresisevaluation.yaml b/config/default-scaling-strategy/evaluation.yaml similarity index 100% rename from config/default-scaling-strategy/autoscaler_v1alpha1_mostwantedtwophasehysteresisevaluation.yaml rename to config/default-scaling-strategy/evaluation.yaml diff --git a/config/default-scaling-strategy/kustomization.yaml b/config/default-scaling-strategy/kustomization.yaml index cd39586..7f360c8 100644 --- a/config/default-scaling-strategy/kustomization.yaml +++ b/config/default-scaling-strategy/kustomization.yaml @@ -1,10 +1,9 @@ -## Append samples of your project ## resources: -- autoscaler_v1alpha1_secrettypeclustershardmanager.yaml -- autoscaler_v1alpha1_prometheuspoll.yaml -- autoscaler_v1alpha1_robustscalingnormalizer.yaml -- autoscaler_v1alpha1_weightedpnormloadindex.yaml -- autoscaler_v1alpha1_longestprocessingtimepartition.yaml -- autoscaler_v1alpha1_mostwantedtwophasehysteresisevaluation.yaml -- autoscaler_v1alpha1_replicasetscaler.yaml +- shard-manager.yaml +- poll.yaml +- normalizer.yaml +- load-index.yaml +- partition.yaml +- evaluation.yaml +- scaler.yaml # 
+kubebuilder:scaffold:manifestskustomizesamples diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_weightedpnormloadindex.yaml b/config/default-scaling-strategy/load-index.yaml similarity index 100% rename from config/default-scaling-strategy/autoscaler_v1alpha1_weightedpnormloadindex.yaml rename to config/default-scaling-strategy/load-index.yaml diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_robustscalingnormalizer.yaml b/config/default-scaling-strategy/normalizer.yaml similarity index 100% rename from config/default-scaling-strategy/autoscaler_v1alpha1_robustscalingnormalizer.yaml rename to config/default-scaling-strategy/normalizer.yaml diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_longestprocessingtimepartition.yaml b/config/default-scaling-strategy/partition.yaml similarity index 100% rename from config/default-scaling-strategy/autoscaler_v1alpha1_longestprocessingtimepartition.yaml rename to config/default-scaling-strategy/partition.yaml diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_prometheuspoll.yaml b/config/default-scaling-strategy/poll.yaml similarity index 100% rename from config/default-scaling-strategy/autoscaler_v1alpha1_prometheuspoll.yaml rename to config/default-scaling-strategy/poll.yaml diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_replicasetscaler.yaml b/config/default-scaling-strategy/scaler.yaml similarity index 92% rename from config/default-scaling-strategy/autoscaler_v1alpha1_replicasetscaler.yaml rename to config/default-scaling-strategy/scaler.yaml index a6ce607..e3ae3cb 100644 --- a/config/default-scaling-strategy/autoscaler_v1alpha1_replicasetscaler.yaml +++ b/config/default-scaling-strategy/scaler.yaml @@ -16,4 +16,4 @@ spec: replicaSetControllerRef: apiGroup: apps kind: StatefulSet - name: default + name: argocd-application-controller diff --git a/config/default-scaling-strategy/autoscaler_v1alpha1_secrettypeclustershardmanager.yaml 
b/config/default-scaling-strategy/shard-manager.yaml similarity index 77% rename from config/default-scaling-strategy/autoscaler_v1alpha1_secrettypeclustershardmanager.yaml rename to config/default-scaling-strategy/shard-manager.yaml index f32cd83..d83f6b9 100644 --- a/config/default-scaling-strategy/autoscaler_v1alpha1_secrettypeclustershardmanager.yaml +++ b/config/default-scaling-strategy/shard-manager.yaml @@ -3,5 +3,5 @@ kind: SecretTypeClusterShardManager metadata: labels: app.kubernetes.io/name: argocd-autoscaler - name: secrettypeclustershardmanager-sample + name: default spec: {} diff --git a/grafana/cluster-autoscaler.json b/grafana/argocd-autoscaler.json similarity index 100% rename from grafana/cluster-autoscaler.json rename to grafana/argocd-autoscaler.json