Skip to content

Commit dc17187

Browse files
authored
NLB based loadbalancer service support in eks deploy (#132)
* Add support for eks deploy with NLB based LoadBalancer Services * Fix error handling in drain and cordon commands
1 parent 7e12102 commit dc17187

File tree

10 files changed

+319
-120
lines changed

10 files changed

+319
-120
lines changed

commonerrors/commonerrors.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Package commonerrors contains error types that are common across the project.
2+
package commonerrors
3+
4+
import "fmt"
5+
6+
// ImpossibleErr is returned for impossible conditions that should never happen in the code. This error should only be
// returned if there is no user remedy and represents a bug in the code.
//
// The underlying string is a short code identifying the place that hit the impossible condition; it is embedded
// verbatim at the end of the rendered error message.
type ImpossibleErr string

// Error implements the error interface. It renders a message asking the user to file a GitHub issue, followed by the
// code carried by the error value.
func (err ImpossibleErr) Error() string {
	return fmt.Sprintf(
		"You reached a point in kubergrunt that should not happen and is almost certainly a bug. Please open a GitHub issue on https://github.com/gruntwork-io/kubergrunt/issues with the contents of this error message. Code: %s",
		string(err),
	)
}

eks/asg.go

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,12 @@ import (
77
"github.com/aws/aws-sdk-go/service/autoscaling"
88
"github.com/aws/aws-sdk-go/service/ec2"
99
"github.com/aws/aws-sdk-go/service/elb"
10+
"github.com/aws/aws-sdk-go/service/elbv2"
1011
"github.com/gruntwork-io/go-commons/collections"
1112
"github.com/gruntwork-io/go-commons/errors"
13+
"github.com/hashicorp/go-multierror"
1214

15+
"github.com/gruntwork-io/kubergrunt/commonerrors"
1316
"github.com/gruntwork-io/kubergrunt/kubectl"
1417
"github.com/gruntwork-io/kubergrunt/logging"
1518
)
@@ -38,6 +41,7 @@ func scaleUp(
3841
asgSvc *autoscaling.AutoScaling,
3942
ec2Svc *ec2.EC2,
4043
elbSvc *elb.ELB,
44+
elbv2Svc *elbv2.ELBV2,
4145
kubectlOptions *kubectl.KubectlOptions,
4246
asgName string,
4347
desiredCapacity int64,
@@ -86,14 +90,14 @@ func scaleUp(
8690
logger.Errorf("Undo by terminating all the new instances and trying again")
8791
return err
8892
}
89-
elbNames, err := kubectl.GetLoadBalancerNames(kubectlOptions)
93+
elbs, err := kubectl.GetAWSLoadBalancers(kubectlOptions)
9094
if err != nil {
9195
logger.Errorf("Error retrieving associated ELB names of the Kubernetes services.")
9296
// TODO: can we use stages to pick up from here?
9397
logger.Errorf("Undo by terminating all the new instances and trying again")
9498
return err
9599
}
96-
err = waitForAnyInstancesRegisteredToELB(elbSvc, elbNames, newInstanceIds)
100+
err = waitForAnyInstancesRegisteredToELB(elbSvc, elbv2Svc, elbs, newInstanceIds)
97101
if err != nil {
98102
logger.Errorf("Timed out waiting for the instances to register to the Service ELBs.")
99103
// TODO: can we use stages to pick up from here?
@@ -242,10 +246,9 @@ func detachInstances(asgSvc *autoscaling.AutoScaling, asgName string, idList []s
242246
return nil
243247
}
244248

245-
// waitForAnyInstancesRegisteredToELB waits until any of the instances provided are registered to all the classic ELBs
246-
// provided. Classic ELB is what is used by the LoadBalancer Service resource in Kubernetes.
247-
// Here we wait for any instance to be registered, because we only need one instance to be registered to preserve
248-
// service uptime, due to the way Kubernetes works.
249+
// waitForAnyInstancesRegisteredToELB waits until any of the instances provided are registered to all the ELBs
250+
// provided. Here we wait for any instance to be registered, because we only need one instance to be registered to
251+
// preserve service uptime, due to the way Kubernetes works.
249252
// Pros:
250253
// - Shorter wait time.
251254
// - Can continue on to drain nodes succinctly, which is also time consuming.
@@ -255,36 +258,36 @@ func detachInstances(asgSvc *autoscaling.AutoScaling, asgName string, idList []s
255258
// - Not all instances are registered, so there is no "load balancing" initially. This may bring down the new server
256259
// that is launched.
257260
// Ultimately, it was decided that the cons are not worth the extended wait time it will introduce to the command.
258-
// TODO: Update this when:
259-
// - we support ALB ingress controllers
260-
// - NLB for LoadBalancer Service resource comes out of alpha
261-
func waitForAnyInstancesRegisteredToELB(elbSvc *elb.ELB, elbNames []string, instanceIds []string) error {
261+
func waitForAnyInstancesRegisteredToELB(elbSvc *elb.ELB, elbv2Svc *elbv2.ELBV2, elbs []kubectl.AWSLoadBalancer, instanceIds []string) error {
262262
logger := logging.GetProjectLogger()
263263
logger.Infof("Verifying new nodes are registered to external load balancers.")
264264

265-
instances := []*elb.Instance{}
266-
for _, instanceID := range instanceIds {
267-
instances = append(instances, &elb.Instance{InstanceId: aws.String(instanceID)})
268-
}
265+
var multipleErrs *multierror.Error
266+
for _, elb := range elbs {
267+
if elb.TargetType == kubectl.IPTarget {
268+
// We ignore ELBs of the IP type as those directly link to Pods and not instances.
269+
continue
270+
} else if elb.TargetType == kubectl.UnknownELBTarget {
271+
// This should never happen, so we return a generic error that indicates this is an impossible condition and
272+
// almost 100% a bug with kubergrunt.
273+
multipleErrs = multierror.Append(commonerrors.ImpossibleErr("UNKNOWN_ELB_TARGET_TYPE_IN_WAIT"))
274+
continue
275+
}
269276

270-
multipleErrs := NewMultipleLookupErrors()
271-
for _, elbName := range elbNames {
272-
logger.Infof("Waiting for at least one instance to be in service for elb %s", elbName)
273-
params := &elb.DescribeInstanceHealthInput{
274-
LoadBalancerName: aws.String(elbName),
275-
Instances: instances,
277+
var err error
278+
switch elb.Type {
279+
case kubectl.CLB:
280+
err = waitForAnyInstancesRegisteredToCLB(logger, elbSvc, elb.Name, instanceIds)
281+
case kubectl.NLB, kubectl.ALB:
282+
err = waitForAnyInstancesRegisteredToALBOrNLB(logger, elbv2Svc, elb.Name, instanceIds)
283+
default:
284+
// This should never happen, so we return a generic error that indicates this is an impossible condition and
285+
// almost 100% a bug with kubergrunt.
286+
err = commonerrors.ImpossibleErr("UNKNOWN_ELB_TYPE_IN_WAIT")
276287
}
277-
err := elbSvc.WaitUntilAnyInstanceInService(params)
278288
if err != nil {
279-
logger.Infof("ERROR: error waiting for any instance to be in service for elb %s", elbName)
280-
multipleErrs.AddError(err)
281-
} else {
282-
logger.Infof("At least one instance in service for elb %s", elbName)
289+
multipleErrs = multierror.Append(multipleErrs, err)
283290
}
284291
}
285-
if !multipleErrs.IsEmpty() {
286-
return multipleErrs
287-
}
288-
logger.Infof("All ELBs have at least one instance in service")
289-
return nil
292+
return multipleErrs.ErrorOrNil()
290293
}

eks/deploy.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/aws/aws-sdk-go/service/autoscaling"
99
"github.com/aws/aws-sdk-go/service/ec2"
1010
"github.com/aws/aws-sdk-go/service/elb"
11+
"github.com/aws/aws-sdk-go/service/elbv2"
1112
"github.com/gruntwork-io/go-commons/errors"
1213

1314
"github.com/gruntwork-io/kubergrunt/eksawshelper"
@@ -47,6 +48,7 @@ func RollOutDeployment(
4748
asgSvc := autoscaling.New(sess)
4849
ec2Svc := ec2.New(sess)
4950
elbSvc := elb.New(sess)
51+
elbv2Svc := elbv2.New(sess)
5052
logger.Infof("Successfully authenticated with AWS")
5153

5254
// Retrieve the ASG object and gather required info we will need later
@@ -89,6 +91,7 @@ func RollOutDeployment(
8991
asgSvc,
9092
ec2Svc,
9193
elbSvc,
94+
elbv2Svc,
9295
kubectlOptions,
9396
eksAsgName,
9497
originalCapacity*2,

eks/elb.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
package eks
2+
3+
import (
4+
"fmt"
5+
"time"
6+
7+
"github.com/aws/aws-sdk-go/aws"
8+
"github.com/aws/aws-sdk-go/service/elb"
9+
"github.com/aws/aws-sdk-go/service/elbv2"
10+
"github.com/gruntwork-io/go-commons/collections"
11+
"github.com/gruntwork-io/go-commons/errors"
12+
"github.com/gruntwork-io/go-commons/retry"
13+
"github.com/sirupsen/logrus"
14+
15+
"github.com/gruntwork-io/kubergrunt/commonerrors"
16+
)
17+
18+
// waitForAnyInstancesRegisteredToALBOrNLB implements the logic to wait for instance registration to Application and
19+
// Network Load Balancers. Refer to function docs for waitForAnyInstancesRegisteredToELB for more info.
20+
// NOTE: this assumes the ELB is using the instance target type.
21+
func waitForAnyInstancesRegisteredToALBOrNLB(logger *logrus.Entry, elbv2Svc *elbv2.ELBV2, lbName string, instanceIDsToWaitFor []string) error {
22+
targetGroup, err := getELBTargetGroup(elbv2Svc, lbName)
23+
if err != nil {
24+
return err
25+
}
26+
27+
// Retry up to 10 minutes with 15 second retry sleep
28+
waitErr := retry.DoWithRetry(
29+
logger.Logger,
30+
fmt.Sprintf(
31+
"wait for expected targets to be registered to target group %s of load balancer %s",
32+
aws.StringValue(targetGroup.TargetGroupName),
33+
lbName,
34+
),
35+
40, 15*time.Second,
36+
func() error {
37+
targetsResp, err := elbv2Svc.DescribeTargetHealth(&elbv2.DescribeTargetHealthInput{TargetGroupArn: targetGroup.TargetGroupArn})
38+
if err != nil {
39+
return retry.FatalError{Underlying: err}
40+
}
41+
42+
// Check each target to see if it is one of the instances we are waiting for, and return without error to
43+
// stop the retry loop if that is the case since condition is met.
44+
for _, targetHealth := range targetsResp.TargetHealthDescriptions {
45+
if targetHealth.Target == nil || targetHealth.Target.Id == nil {
46+
continue
47+
}
48+
instanceID := *targetHealth.Target.Id
49+
if collections.ListContainsElement(instanceIDsToWaitFor, instanceID) {
50+
return nil
51+
}
52+
}
53+
return fmt.Errorf("No expected instances registered yet")
54+
},
55+
)
56+
if fatalWaitErr, isFatalErr := waitErr.(retry.FatalError); isFatalErr {
57+
return errors.WithStackTrace(fatalWaitErr.Underlying)
58+
}
59+
return errors.WithStackTrace(waitErr)
60+
}
61+
62+
// waitForAnyInstancesRegisteredToCLB implements the logic to wait for instance registration to Classic Load Balancers.
63+
// Refer to function docs for waitForAnyInstancesRegisteredToELB for more info.
64+
func waitForAnyInstancesRegisteredToCLB(logger *logrus.Entry, elbSvc *elb.ELB, lbName string, instanceIds []string) error {
65+
instances := []*elb.Instance{}
66+
for _, instanceID := range instanceIds {
67+
instances = append(instances, &elb.Instance{InstanceId: aws.String(instanceID)})
68+
}
69+
70+
logger.Infof("Waiting for at least one instance to be in service for elb %s", lbName)
71+
params := &elb.DescribeInstanceHealthInput{
72+
LoadBalancerName: aws.String(lbName),
73+
Instances: instances,
74+
}
75+
err := elbSvc.WaitUntilAnyInstanceInService(params)
76+
if err != nil {
77+
logger.Errorf("error waiting for any instance to be in service for elb %s", lbName)
78+
return err
79+
}
80+
logger.Infof("At least one instance in service for elb %s", lbName)
81+
return nil
82+
}
83+
84+
// getELBTargetGroup looks up the associated TargetGroup of the given ELB. Note that this assumes:
85+
// - lbName refers to a v2 ELB (ALB or NLB)
86+
// - There is exactly one TargetGroup associated with the ELB (this is enforced by the Kubernetes controllers)
87+
func getELBTargetGroup(elbv2Svc *elbv2.ELBV2, lbName string) (*elbv2.TargetGroup, error) {
88+
resp, err := elbv2Svc.DescribeLoadBalancers(&elbv2.DescribeLoadBalancersInput{Names: aws.StringSlice([]string{lbName})})
89+
if err != nil {
90+
return nil, errors.WithStackTrace(err)
91+
}
92+
93+
if len(resp.LoadBalancers) == 0 {
94+
return nil, errors.WithStackTrace(CouldNotFindLoadBalancerErr{name: lbName})
95+
} else if len(resp.LoadBalancers) > 1 {
96+
// This condition is impossible because we are querying a single LB name and names are unique within regions.
97+
return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("MORE_THAN_ONE_ELB_IN_LOOKUP"))
98+
} else if resp.LoadBalancers[0] == nil {
99+
return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("ELB_IS_NULL_FROM_API"))
100+
}
101+
elb := resp.LoadBalancers[0]
102+
103+
targetGroupsResp, err := elbv2Svc.DescribeTargetGroups(&elbv2.DescribeTargetGroupsInput{LoadBalancerArn: elb.LoadBalancerArn})
104+
if err != nil {
105+
return nil, errors.WithStackTrace(err)
106+
}
107+
108+
if len(targetGroupsResp.TargetGroups) != 1 {
109+
// This is an impossible condition because the load balancer controllers always only creates a single target
110+
// group for the ELBs it provisions.
111+
return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("ELB_HAS_UNEXPECTED_NUMBER_OF_TARGET_GROUPS"))
112+
}
113+
return targetGroupsResp.TargetGroups[0], nil
114+
}

eks/errors.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,15 @@ func (err NetworkInterfaceDeletedTimeoutError) Error() string {
168168
err.networkInterfaceId,
169169
)
170170
}
171+
172+
// CouldNotFindLoadBalancerErr is returned when the given ELB can not be found.
type CouldNotFindLoadBalancerErr struct {
	// name is the load balancer name that was looked up.
	name string
}

// Error implements the error interface, reporting the name of the load balancer that could not be found.
func (err CouldNotFindLoadBalancerErr) Error() string {
	return fmt.Sprintf(
		"Could not find ELB with name %s.",
		err.name,
	)
}

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ require (
77
github.com/blang/semver/v4 v4.0.0
88
github.com/gruntwork-io/go-commons v0.8.2
99
github.com/gruntwork-io/terratest v0.32.9
10+
github.com/hashicorp/go-multierror v1.1.0
1011
github.com/mitchellh/go-homedir v1.1.0
1112
github.com/sirupsen/logrus v1.6.0
1213
github.com/stretchr/testify v1.6.1

kubectl/errors.go

Lines changed: 14 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -55,66 +55,12 @@ type NodeDrainError struct {
5555
NodeID string
5656
}
5757

58-
// NodeDrainErrors is returned when there are errors draining nodes concurrently. Each node that has an error is added
59-
// to the list.
60-
type NodeDrainErrors struct {
61-
errors []NodeDrainError
62-
}
63-
64-
func (err NodeDrainErrors) Error() string {
65-
base := "Multiple errors caught while draining a node:\n"
66-
for _, subErr := range err.errors {
67-
subErrMessage := fmt.Sprintf("Node %s: %s", subErr.NodeID, subErr.Error)
68-
base = base + subErrMessage + "\n"
69-
}
70-
return base
71-
}
72-
73-
func (err NodeDrainErrors) AddError(newErr NodeDrainError) {
74-
err.errors = append(err.errors, newErr)
75-
}
76-
77-
func (err NodeDrainErrors) IsEmpty() bool {
78-
return len(err.errors) == 0
79-
}
80-
81-
func NewNodeDrainErrors() NodeDrainErrors {
82-
return NodeDrainErrors{[]NodeDrainError{}}
83-
}
84-
8558
// NodeCordonError is returned when there is an error cordoning a node.
type NodeCordonError struct {
	// Error is the underlying error encountered while cordoning the node.
	Error error
	// NodeID identifies the node that could not be cordoned.
	NodeID string
}
9063

91-
// NodeCordonErrors is returned when there are errors cordoning nodes concurrently. Each node that has an error is added
92-
// to the list.
93-
type NodeCordonErrors struct {
94-
errors []NodeCordonError
95-
}
96-
97-
func (err NodeCordonErrors) Error() string {
98-
base := "Multiple errors caught while cordoning nodes:\n"
99-
for _, subErr := range err.errors {
100-
subErrMessage := fmt.Sprintf("Node %s: %s", subErr.NodeID, subErr.Error)
101-
base = base + subErrMessage + "\n"
102-
}
103-
return base
104-
}
105-
106-
func (err NodeCordonErrors) AddError(newErr NodeCordonError) {
107-
err.errors = append(err.errors, newErr)
108-
}
109-
110-
func (err NodeCordonErrors) IsEmpty() bool {
111-
return len(err.errors) == 0
112-
}
113-
114-
func NewNodeCordonErrors() NodeCordonErrors {
115-
return NodeCordonErrors{[]NodeCordonError{}}
116-
}
117-
11864
// LoadBalancerNotReadyError is returned when the LoadBalancer Service is unexpectedly not ready.
11965
type LoadBalancerNotReadyError struct {
12066
serviceName string
@@ -154,3 +100,17 @@ func (err ProvisionIngressEndpointTimeoutError) Error() string {
154100
err.namespace,
155101
)
156102
}
103+
104+
// UnknownAWSLoadBalancerTypeErr is returned when we encounter a load balancer type that we don't expect/support.
type UnknownAWSLoadBalancerTypeErr struct {
	// typeKey is the annotation key whose value was not recognized.
	typeKey string
	// typeStr is the unrecognized annotation value.
	typeStr string
}

// Error implements the error interface, reporting the annotation key together with the unrecognized value.
func (err UnknownAWSLoadBalancerTypeErr) Error() string {
	return fmt.Sprintf(
		"Unknown value for annotation %s (value: %s)",
		err.typeKey,
		err.typeStr,
	)
}

0 commit comments

Comments
 (0)