Skip to content

Commit

Permalink
Add option to always schedule on external clusters (#1089)
Browse files Browse the repository at this point in the history
External clusters may have scaling behavior that can accommodate
any number of new installations. This change adds an option to ignore
cluster resource thresholds on external clusters so that installations
can always be scheduled on this cluster type.
  • Loading branch information
gabrieljackson authored Dec 10, 2024
1 parent b1b8f71 commit 09af48b
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 16 deletions.
1 change: 1 addition & 0 deletions cmd/cloud/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ func executeServerCmd(flags serverFlags) error {
installationScheduling := supervisor.NewInstallationSupervisorSchedulingOptions(
flags.balancedInstallationScheduling,
flags.preferScheduleOnStableClusters,
flags.alwaysScheduleExternalClusters,
flags.clusterResourceThreshold,
flags.thresholdCPUOverride,
flags.thresholdMemoryOverride,
Expand Down
2 changes: 2 additions & 0 deletions cmd/cloud/server_flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ func (flags *supervisorOptions) addFlags(command *cobra.Command) {
type schedulingOptions struct {
balancedInstallationScheduling bool
preferScheduleOnStableClusters bool
alwaysScheduleExternalClusters bool
clusterResourceThresholdScaleValue int
clusterResourceThreshold int
thresholdCPUOverride int
Expand All @@ -62,6 +63,7 @@ type schedulingOptions struct {
func (flags *schedulingOptions) addFlags(command *cobra.Command) {
command.Flags().BoolVar(&flags.balancedInstallationScheduling, "balanced-installation-scheduling", true, "Whether to schedule installations on the cluster with the greatest percentage of available resources or not. (slows down scheduling speed as cluster count increases)")
command.Flags().BoolVar(&flags.preferScheduleOnStableClusters, "prefer-stable-cluster-installation-scheduling", false, "Whether to prioritize scheduling installations on the clusters in the stable state or not. (can slow scheduling speed as cluster count increases)")
command.Flags().BoolVar(&flags.alwaysScheduleExternalClusters, "always-schedule-external-clusters", false, "Whether to always schedule installations on the clusters that are externally managed by ignoring resource thresholds.")
command.Flags().IntVar(&flags.clusterResourceThresholdScaleValue, "cluster-resource-threshold-scale-value", 0, "The number of worker nodes to scale up by when the threshold is passed. Set to 0 for no scaling. Scaling will never exceed the cluster max worker configuration value.")
command.Flags().IntVar(&flags.clusterResourceThreshold, "cluster-resource-threshold", 80, "The percent threshold where new installations won't be scheduled on a multi-tenant cluster.")
command.Flags().IntVar(&flags.thresholdCPUOverride, "cluster-resource-threshold-cpu-override", 0, "The cluster-resource-threshold override value for CPU resources only")
Expand Down
16 changes: 13 additions & 3 deletions internal/supervisor/installation.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ type InstallationSupervisorCache struct {
type InstallationSupervisorSchedulingOptions struct {
BalanceInstallations bool
PreferScheduleOnStableClusters bool
AlwaysScheduleExternalClusters bool
ClusterResourceThresholdCPU int
ClusterResourceThresholdMemory int
ClusterResourceThresholdPodCount int
Expand Down Expand Up @@ -169,10 +170,11 @@ func NewInstallationSupervisor(
}

// NewInstallationSupervisorSchedulingOptions creates a new InstallationSupervisorSchedulingOptions.
func NewInstallationSupervisorSchedulingOptions(balanceInstallations, preferStableClusters bool, clusterResourceThreshold, thresholdCPUOverride, thresholdMemoryOverride, thresholdPodCountOverride, clusterResourceThresholdScaleValue int) InstallationSupervisorSchedulingOptions {
func NewInstallationSupervisorSchedulingOptions(balanceInstallations, preferStableClusters, alwaysScheduleExternalClusters bool, clusterResourceThreshold, thresholdCPUOverride, thresholdMemoryOverride, thresholdPodCountOverride, clusterResourceThresholdScaleValue int) InstallationSupervisorSchedulingOptions {
schedulingOptions := InstallationSupervisorSchedulingOptions{
BalanceInstallations: balanceInstallations,
PreferScheduleOnStableClusters: preferStableClusters,
AlwaysScheduleExternalClusters: alwaysScheduleExternalClusters,
ClusterResourceThresholdCPU: clusterResourceThreshold,
ClusterResourceThresholdMemory: clusterResourceThreshold,
ClusterResourceThresholdPodCount: clusterResourceThreshold,
Expand Down Expand Up @@ -599,9 +601,17 @@ func (s *InstallationSupervisor) createClusterInstallation(cluster *model.Cluste
memoryPercent := clusterResources.CalculateMemoryPercentUsed(installationMemRequirement)
podPercent := clusterResources.CalculatePodCountPercentUsed(installationPodCountRequirement)

if cpuPercent > s.scheduling.ClusterResourceThresholdCPU ||
// Determine if a resource check should be performed.
performResourceCheck := true
if cluster.IsExternallyManaged() && s.scheduling.AlwaysScheduleExternalClusters {
performResourceCheck = false
}

resourcesOverThreshold := cpuPercent > s.scheduling.ClusterResourceThresholdCPU ||
memoryPercent > s.scheduling.ClusterResourceThresholdMemory ||
podPercent > s.scheduling.ClusterResourceThresholdPodCount {
podPercent > s.scheduling.ClusterResourceThresholdPodCount

if performResourceCheck && resourcesOverThreshold {

var provisionerMetadata model.ProvisionerMetadata
if cluster.Provisioner == model.ProvisionerKops {
Expand Down
70 changes: 57 additions & 13 deletions internal/supervisor/installation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ func (m *mockCloudflareClient) DeleteDNSRecords(customerDNSName []string, logger
}

func TestInstallationSupervisorDo(t *testing.T) {
standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, 80, 0, 0, 0, 0)
standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, false, 80, 0, 0, 0, 0)
require.NoError(t, standardSchedulingOptions.Validate())

t.Run("no installations pending work", func(t *testing.T) {
Expand Down Expand Up @@ -733,7 +733,7 @@ func TestInstallationSupervisorDo(t *testing.T) {
}

func TestInstallationSupervisor(t *testing.T) {
standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, 80, 0, 0, 0, 0)
standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, false, 80, 0, 0, 0, 0)
require.NoError(t, standardSchedulingOptions.Validate())

expectInstallationState := func(t *testing.T, sqlStore *store.SQLStore, installation *model.Installation, expectedState string) {
Expand Down Expand Up @@ -2495,7 +2495,7 @@ func TestInstallationSupervisor(t *testing.T) {
UsedPodCount: 100,
},
}
schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, 80, 0, 0, 0, 2)
schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, false, 80, 0, 0, 0, 2)
require.NoError(t, schedulingOptions.Validate())
supervisor := supervisor.NewInstallationSupervisor(
sqlStore,
Expand Down Expand Up @@ -2542,7 +2542,7 @@ func TestInstallationSupervisor(t *testing.T) {
sqlStore := store.MakeTestSQLStore(t, logger)
defer store.CloseConnection(t, sqlStore)

schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 0, 0)
schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 0, 0, 0)
require.NoError(t, schedulingOptions.Validate())
supervisor := supervisor.NewInstallationSupervisor(
sqlStore,
Expand Down Expand Up @@ -2726,9 +2726,39 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
}{
{
name: "valid, no overrides",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 0, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 80,
ClusterResourceThresholdScaleValue: 2,
},
expectError: false,
},
{
name: "valid, no overrides, prefer stable clusters",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, true, false, 80, 0, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: true,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 80,
ClusterResourceThresholdScaleValue: 2,
},
expectError: false,
},
{
name: "valid, no overrides, always schedule external",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, true, 80, 0, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: true,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 80,
Expand All @@ -2738,9 +2768,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "valid, cpu override",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 40, 0, 0, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 40, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 40,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 80,
Expand All @@ -2750,9 +2782,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "valid, memory override",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 40, 0, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 40, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 40,
ClusterResourceThresholdPodCount: 80,
Expand All @@ -2762,9 +2796,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "valid, pod count override",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 40, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 0, 40, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 40,
Expand All @@ -2774,9 +2810,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "invalid, no overrides",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, -1, 0, 0, 0, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, -1, 0, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: -1,
ClusterResourceThresholdMemory: -1,
ClusterResourceThresholdPodCount: -1,
Expand All @@ -2786,9 +2824,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "invalid, cpu override",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 2, 0, 0, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 2, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 2,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 80,
Expand All @@ -2798,7 +2838,7 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "invalid, memory override",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 2, 0, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 2, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
Expand All @@ -2810,9 +2850,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "invalid, pod count override",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 2, 2),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 0, 2, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 2,
Expand All @@ -2822,9 +2864,11 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
},
{
name: "invalid, scale value out of bounds",
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 0, -1),
inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, false, 80, 0, 0, 0, -1),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
PreferScheduleOnStableClusters: false,
AlwaysScheduleExternalClusters: false,
ClusterResourceThresholdCPU: 80,
ClusterResourceThresholdMemory: 80,
ClusterResourceThresholdPodCount: 80,
Expand Down
5 changes: 5 additions & 0 deletions model/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ func (c *Cluster) HasAWSInfrastructure() bool {
return true
}

// IsExternallyManaged reports whether the cluster is externally managed,
// i.e. whether its Provider field equals ProviderExternal.
func (c *Cluster) IsExternallyManaged() bool {
return c.Provider == ProviderExternal
}

func (c *Cluster) ApplyClusterUpdatePatch(patchRequest *UpdateClusterRequest) bool {
var applied bool
if patchRequest.Name != nil && *patchRequest.Name != c.Name {
Expand Down

0 comments on commit 09af48b

Please sign in to comment.