Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow scaling events to be logged during cluster validation #16354

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloudmock/gce/mockcompute/instance_group_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ func (c *instanceGroupManagerClient) List(ctx context.Context, project, zone str
return l, nil
}

func (c *instanceGroupManagerClient) ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error) {
return []*compute.InstanceManagedByIgmError{}, nil
}

func (c *instanceGroupManagerClient) ListManagedInstances(ctx context.Context, project, zone, name string) ([]*compute.ManagedInstance, error) {
var instances []*compute.ManagedInstance
return instances, nil
Expand Down
11 changes: 11 additions & 0 deletions pkg/cloudinstances/cloud_instance_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package cloudinstances
import (
"fmt"
"strings"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
Expand All @@ -35,11 +36,21 @@ type CloudInstanceGroup struct {
MinSize int
TargetSize int
MaxSize int
Events []ScalingEvent

// Raw allows for the implementer to attach an object, for tracking additional state
Raw interface{}
}

type ScalingEvent struct {
Timestamp time.Time
Description string
}

func (e ScalingEvent) String() string {
return fmt.Sprintf("%s: %s", e.Timestamp.Format(time.RFC3339), e.Description)
}

// NewCloudInstance creates a new CloudInstance
func (c *CloudInstanceGroup) NewCloudInstance(instanceId string, status string, node *v1.Node) (*CloudInstance, error) {
if instanceId == "" {
Expand Down
10 changes: 10 additions & 0 deletions pkg/validation/validate_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,16 @@ func (v *ValidationCluster) validateNodes(cloudGroups map[string]*cloudinstances
cloudGroup.TargetSize),
InstanceGroup: cloudGroup.InstanceGroup,
})
if klog.V(4).Enabled() {
// Log scaling events
klog.V(4).Infof("Scaling events for instance group: %v", cloudGroup.InstanceGroup.Name)
sort.Slice(cloudGroup.Events, func(i, j int) bool {
return cloudGroup.Events[i].Timestamp.Before(cloudGroup.Events[j].Timestamp)
})
for _, event := range cloudGroup.Events {
klog.Infof("%v", event)
}
}
}

for _, member := range allMembers {
Expand Down
5 changes: 2 additions & 3 deletions tests/e2e/scenarios/upgrade-ab/run-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,9 @@ sleep 120s

${CHANNELS} apply channel "$KOPS_STATE_STORE"/"${CLUSTER_NAME}"/addons/bootstrap-channel.yaml --yes -v4

"${KOPS_B}" rolling-update cluster
"${KOPS_B}" rolling-update cluster --yes --validation-timeout 30m
"${KOPS_B}" rolling-update cluster --yes --validation-timeout 30m -v 4

"${KOPS_B}" validate cluster
"${KOPS_B}" validate cluster -v 4

# Verify kubeconfig-a still works
kubectl get nodes -owide --kubeconfig="${KUBECONFIG_A}"
Expand Down
16 changes: 16 additions & 0 deletions upup/pkg/fi/cloudup/awsup/aws_cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -1196,13 +1196,29 @@ func awsBuildCloudInstanceGroup(ctx context.Context, c AWSCloud, cluster *kops.C
return nil, fmt.Errorf("failed to fetch instances: %v", err)
}

scalingReq := &autoscaling.DescribeScalingActivitiesInput{
Copy link
Member

@justinsb justinsb Feb 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know whether this gets expensive, but I wonder if we should plumb through a flag here as to whether we want this information (or add a method or func to CloudInstanceGroup that could query it on-demand)

AutoScalingGroupName: g.AutoScalingGroupName,
}
scalingEvents := make([]cloudinstances.ScalingEvent, 0)
c.Autoscaling().DescribeScalingActivitiesPagesWithContext(ctx, scalingReq, func(p *autoscaling.DescribeScalingActivitiesOutput, lastPage bool) bool {
for _, activity := range p.Activities {
event := cloudinstances.ScalingEvent{
Timestamp: aws.TimeValue(activity.StartTime),
Description: aws.StringValue(activity.Description),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like there's some extra fields; I'm not sure if Description has all/most of the information, but you might consider including the Raw any field (which will then get printed!)

}
scalingEvents = append(scalingEvents, event)
}
return true
})

cg := &cloudinstances.CloudInstanceGroup{
HumanName: aws.StringValue(g.AutoScalingGroupName),
InstanceGroup: ig,
MinSize: int(aws.Int64Value(g.MinSize)),
TargetSize: int(aws.Int64Value(g.DesiredCapacity)),
MaxSize: int(aws.Int64Value(g.MaxSize)),
Raw: g,
Events: scalingEvents,
}

for _, i := range g.Instances {
Expand Down
12 changes: 12 additions & 0 deletions upup/pkg/fi/cloudup/gce/compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,7 @@ type InstanceGroupManagerClient interface {
Delete(project, zone, name string) (*compute.Operation, error)
Get(project, zone, name string) (*compute.InstanceGroupManager, error)
List(ctx context.Context, project, zone string) ([]*compute.InstanceGroupManager, error)
ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error)
ListManagedInstances(ctx context.Context, project, zone, name string) ([]*compute.ManagedInstance, error)
RecreateInstances(project, zone, name, id string) (*compute.Operation, error)
SetTargetPools(project, zone, name string, targetPools []string) (*compute.Operation, error)
Expand Down Expand Up @@ -709,6 +710,17 @@ func (c *instanceGroupManagerClientImpl) List(ctx context.Context, project, zone
return ms, nil
}

func (c *instanceGroupManagerClientImpl) ListErrors(ctx context.Context, project, zone, name string) ([]*compute.InstanceManagedByIgmError, error) {
Copy link
Member

@justinsb justinsb Feb 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aside: I sort of regret our having this layer in GCE, I'm not sure it adds much!

var igmErrors []*compute.InstanceManagedByIgmError
if err := c.srv.ListErrors(project, zone, name).Pages(ctx, func(page *compute.InstanceGroupManagersListErrorsResponse) error {
igmErrors = append(igmErrors, page.Items...)
return nil
}); err != nil {
return nil, err
}
return igmErrors, nil
}

func (c *instanceGroupManagerClientImpl) ListManagedInstances(ctx context.Context, project, zone, name string) ([]*compute.ManagedInstance, error) {
var instances []*compute.ManagedInstance
if err := c.srv.ListManagedInstances(project, zone, name).Pages(ctx, func(page *compute.InstanceGroupManagersListManagedInstancesResponse) error {
Expand Down
28 changes: 28 additions & 0 deletions upup/pkg/fi/cloudup/gce/instancegroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"fmt"
"hash/fnv"
"strings"
"time"

compute "google.golang.org/api/compute/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -153,13 +154,22 @@ func getCloudGroups(c GCECloud, cluster *kops.Cluster, instancegroups []*kops.In
continue
}

igmErrors, err := c.Compute().InstanceGroupManagers().ListErrors(ctx, project, zoneName, name)
if err != nil {
return nil, fmt.Errorf("getting MIG Errors: %v", err)
}
events, err := igmErrorsToScalingEvents(igmErrors)
if err != nil {
return nil, fmt.Errorf("error converting MIG errors to scaling events: %v", err)
}
g := &cloudinstances.CloudInstanceGroup{
HumanName: mig.Name,
InstanceGroup: ig,
MinSize: int(mig.TargetSize),
TargetSize: int(mig.TargetSize),
MaxSize: int(mig.TargetSize),
Raw: mig,
Events: events,
}
groups[mig.Name] = g

Expand Down Expand Up @@ -264,6 +274,24 @@ func matchInstanceGroup(mig *compute.InstanceGroupManager, c *kops.Cluster, inst
return matches[0], nil
}

func igmErrorsToScalingEvents(igmErrors []*compute.InstanceManagedByIgmError) ([]cloudinstances.ScalingEvent, error) {
events := make([]cloudinstances.ScalingEvent, 0)
for _, error := range igmErrors {
ts, err := time.Parse(time.RFC3339, error.Timestamp)
if err != nil {
return nil, err
}
if error.Error != nil {
activity := cloudinstances.ScalingEvent{
Timestamp: ts,
Description: error.Error.Message,
}
events = append(events, activity)
}
}
return events, nil
}

func addCloudInstanceData(cm *cloudinstances.CloudInstance, instance *compute.Instance) {
cm.MachineType = LastComponent(instance.MachineType)
cm.Status = instance.Status
Expand Down