Commit f880340

Authored by Kubernetes Submit Queue
Merge pull request kubernetes#43231 from csbell/service-race
Automatic merge from submit-queue

[Federation] Fix deletion logic in service controller

This is a regression from 1.5 exposed by cascading deletions. To apply updates, the service controller locks access to a cached service and spawns goroutines without waiting for them. When updates and deletions arrive in quick succession, the earlier goroutines remain active and race with the deletion logic. Compounding this, service_helper was not re-evaluating the value of the DeletionTimestamp.

Without this patch, federation leaks resources at destruction time roughly half the time. In e2e, about 4-5 test runs are enough for the service tests to exhaust all global forwarding rules, after which every subsequent ingress test fails until the leaked resources are cleaned up manually. Federation e2e cannot go green until this is merged.
2 parents ae6a5d2 + 3769435
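
The race described in the message above can be made concrete with a small, self-contained Go sketch. This is not the controller's code; the cluster names and sleep are invented purely to illustrate the pattern: per-cluster goroutines launched without waiting keep running after the caller has moved on to a deletion, while waiting on a sync.WaitGroup (the approach taken in this patch) closes that window.

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	clusters := []string{"cluster-a", "cluster-b"} // hypothetical member clusters

	// Fire-and-forget: the "update" returns before its goroutines finish,
	// so a deletion that follows immediately can interleave with them.
	for _, c := range clusters {
		go func(c string) {
			time.Sleep(10 * time.Millisecond)      // simulated apiserver round trip
			fmt.Println("recreated service in", c) // may run after the deletion below
		}(c)
	}
	fmt.Println("deleting federated service") // races with the goroutines above

	// The fix: wait for the fan-out before the next (possibly deleting) event
	// is allowed to proceed.
	var wg sync.WaitGroup
	for _, c := range clusters {
		wg.Add(1)
		go func(c string) {
			defer wg.Done()
			fmt.Println("updated service in", c)
		}(c)
	}
	wg.Wait()
	fmt.Println("deletion can now proceed without racing the updates")
}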

File tree

2 files changed: +14 -2 lines changed

federation/pkg/federation-controller/service/service_helper.go

Lines changed: 5 additions & 1 deletion
@@ -131,9 +131,13 @@ func (cc *clusterClientCache) syncService(key, clusterName string, clusterCache
 	if isDeletion {
 		// cachedService is not reliable here as
 		// deleting cache is the last step of federation service deletion
-		_, err := fedClient.Core().Services(cachedService.lastState.Namespace).Get(cachedService.lastState.Name, metav1.GetOptions{})
+		service, err := fedClient.Core().Services(cachedService.lastState.Namespace).Get(cachedService.lastState.Name, metav1.GetOptions{})
 		// rebuild service if federation service still exists
 		if err == nil || !errors.IsNotFound(err) {
+			if err == nil && service.DeletionTimestamp != nil {
+				glog.V(4).Infof("Skipping sync of service %v in underlying clusters as it has already been marked for deletion", name)
+				return nil
+			}
 			return sc.ensureClusterService(cachedService, clusterName, cachedService.appliedState, clusterCache.clientset)
 		}
 	}
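
The essence of this hunk is that a successful Get of the federated service is no longer enough to justify rebuilding its per-cluster copy: the object can still exist while a cascading deletion is in flight, which is signaled by a non-nil DeletionTimestamp. Below is a minimal sketch of that decision with a hypothetical helper name; the import paths follow the current apimachinery layout, which may differ from the tree this commit was made against.

package service

import (
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
)

// shouldRebuildClusterService decides whether a service deleted in a member
// cluster should be recreated: only if the federated service still exists and
// is not already marked for deletion.
func shouldRebuildClusterService(federatedService *v1.Service, getErr error) bool {
	if errors.IsNotFound(getErr) {
		// The federated service is gone; there is nothing to rebuild.
		return false
	}
	if getErr == nil && federatedService.DeletionTimestamp != nil {
		// A cascading deletion is in flight (DeletionTimestamp set, finalizers
		// pending); recreating the per-cluster copy would race with it and
		// leak resources.
		return false
	}
	// Either the service exists and is live, or the Get failed transiently;
	// keep the existing behaviour of re-ensuring the cluster service.
	return true
}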

federation/pkg/federation-controller/service/servicecontroller.go

Lines changed: 9 additions & 1 deletion
@@ -487,6 +487,10 @@ func wantsDNSRecords(service *v1.Service) bool {
 // update DNS records and update the service info with DNS entries to federation apiserver.
 // the function returns any error caught
 func (s *ServiceController) processServiceForCluster(cachedService *cachedService, clusterName string, service *v1.Service, client *kubeclientset.Clientset) error {
+	if service.DeletionTimestamp != nil {
+		glog.V(4).Infof("Service has already been marked for deletion %v", service.Name)
+		return nil
+	}
 	glog.V(4).Infof("Process service %s/%s for cluster %s", service.Namespace, service.Name, clusterName)
 	// Create or Update k8s Service
 	err := s.ensureClusterService(cachedService, clusterName, service, client)
@@ -508,15 +512,19 @@ func (s *ServiceController) updateFederationService(key string, cachedService *c
 	}

 	// handle available clusters one by one
-	var hasErr bool
+	hasErr := false
+	var wg sync.WaitGroup
 	for clusterName, cache := range s.clusterCache.clientMap {
+		wg.Add(1)
 		go func(cache *clusterCache, clusterName string) {
+			defer wg.Done()
 			err := s.processServiceForCluster(cachedService, clusterName, desiredService, cache.clientset)
 			if err != nil {
 				hasErr = true
 			}
 		}(cache, clusterName)
 	}
+	wg.Wait()
 	if hasErr {
 		// detail error has been dumped inside the loop
 		return fmt.Errorf("Service %s/%s was not successfully updated to all clusters", desiredService.Namespace, desiredService.Name), retryable
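
The second hunk makes updateFederationService wait for its per-cluster goroutines before returning, so the caller no longer moves on (and releases its hold on the cached service) while updates are still in flight. Below is a generic sketch of that fan-out-and-wait pattern with hypothetical names; unlike the hunk, it guards the shared failure state with a mutex rather than an unsynchronized bool flag.

package main

import (
	"fmt"
	"sync"
)

// updateAllClusters runs one update function per cluster concurrently and
// only returns once every goroutine has finished.
func updateAllClusters(clusters map[string]func() error) error {
	var (
		mu     sync.Mutex
		failed []string
		wg     sync.WaitGroup
	)
	for name, update := range clusters {
		wg.Add(1)
		go func(name string, update func() error) {
			defer wg.Done()
			if err := update(); err != nil {
				mu.Lock()
				failed = append(failed, name)
				mu.Unlock()
			}
		}(name, update)
	}
	// Waiting here is the essence of the fix: no per-cluster work outlives
	// this call, so a subsequent deletion cannot race with it.
	wg.Wait()
	if len(failed) > 0 {
		return fmt.Errorf("service was not successfully updated to all clusters: %v", failed)
	}
	return nil
}

func main() {
	err := updateAllClusters(map[string]func() error{
		"cluster-a": func() error { return nil },
		"cluster-b": func() error { return fmt.Errorf("timeout") },
	})
	fmt.Println(err)
}

Collecting the names of the failing clusters (or using a package such as golang.org/x/sync/errgroup) also preserves which cluster failed, whereas a single shared bool only records that something failed.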
