Skip to content

Commit 966c44d

Browse files
committed
taskresource: wait for execution role credentials upon agent restart
1 parent 2a3bcad commit 966c44d

22 files changed

+459
-30
lines changed

agent/engine/dependencygraph/graph.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,8 @@ const (
4545
)
4646

4747
var (
48-
// CredentialsNotResolvedErr is the error where a container needs to wait for
49-
// credentials before it can process by agent
50-
CredentialsNotResolvedErr = &dependencyError{err: errors.New("dependency graph: container execution credentials not available")}
48+
// CredentialsNotResolvedErr is the error when a task needs to wait for credentials before it can be progressed to its desired status by the agent
49+
CredentialsNotResolvedErr = &dependencyError{err: errors.New("dependency graph: execution role credentials not available")}
5150
// DependentContainerNotResolvedErr is the error where a dependent container isn't in expected state
5251
DependentContainerNotResolvedErr = &dependencyError{err: errors.New("dependency graph: dependent container not in expected state")}
5352
// ContainerPastDesiredStatusErr is the error where the container status is bigger than desired status

agent/engine/engine_sudo_linux_integ_test.go

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -868,9 +868,19 @@ func TestGMSATaskFileS3Err(t *testing.T) {
868868
cfg.GMSACapable = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
869869
cfg.AWSRegion = "us-west-2"
870870

871-
taskEngine, done, _ := setupGMSALinux(cfg, nil, t)
871+
taskEngine, done, credentialsManager := setupGMSALinux(cfg, nil, t)
872872
defer done()
873873

874+
// mock execution role credentials
875+
mockCreds := &credentials.TaskIAMRoleCredentials{
876+
ARN: "testGMSAFileTaskARN",
877+
IAMRoleCredentials: credentials.IAMRoleCredentials{
878+
RoleArn: "arn:aws:iam::123456789012:role/execution-role",
879+
CredentialsID: "exec-creds-id",
880+
},
881+
}
882+
credentialsManager.SetTaskCredentials(mockCreds)
883+
874884
stateChangeEvents := taskEngine.StateChangeEvents()
875885

876886
testContainer := CreateTestContainer()
@@ -880,11 +890,12 @@ func TestGMSATaskFileS3Err(t *testing.T) {
880890
testContainer.DockerConfig.HostConfig = &hostConfig
881891

882892
testTask := &apitask.Task{
883-
Arn: "testGMSAFileTaskARN",
884-
Family: "family",
885-
Version: "1",
886-
DesiredStatusUnsafe: apitaskstatus.TaskRunning,
887-
Containers: []*apicontainer.Container{testContainer},
893+
Arn: "testGMSAFileTaskARN",
894+
Family: "family",
895+
Version: "1",
896+
DesiredStatusUnsafe: apitaskstatus.TaskRunning,
897+
Containers: []*apicontainer.Container{testContainer},
898+
ExecutionCredentialsID: "exec-creds-id",
888899
}
889900
testTask.Containers[0].TransitionDependenciesMap = make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet)
890901
testTask.ResourcesMapUnsafe = make(map[string][]taskresource.TaskResource)
@@ -908,9 +919,19 @@ func TestGMSATaskFileSSMErr(t *testing.T) {
908919
cfg.GMSACapable = config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled}
909920
cfg.AWSRegion = "us-west-2"
910921

911-
taskEngine, done, _ := setupGMSALinux(cfg, nil, t)
922+
taskEngine, done, credentialsManager := setupGMSALinux(cfg, nil, t)
912923
defer done()
913924

925+
// mock execution role credentials
926+
mockCreds := &credentials.TaskIAMRoleCredentials{
927+
ARN: "testGMSAFileTaskARN",
928+
IAMRoleCredentials: credentials.IAMRoleCredentials{
929+
RoleArn: "arn:aws:iam::123456789012:role/execution-role",
930+
CredentialsID: "exec-creds-id",
931+
},
932+
}
933+
credentialsManager.SetTaskCredentials(mockCreds)
934+
914935
stateChangeEvents := taskEngine.StateChangeEvents()
915936

916937
testContainer := CreateTestContainer()
@@ -920,11 +941,12 @@ func TestGMSATaskFileSSMErr(t *testing.T) {
920941
testContainer.DockerConfig.HostConfig = &hostConfig
921942

922943
testTask := &apitask.Task{
923-
Arn: "testGMSAFileTaskARN",
924-
Family: "family",
925-
Version: "1",
926-
DesiredStatusUnsafe: apitaskstatus.TaskRunning,
927-
Containers: []*apicontainer.Container{testContainer},
944+
Arn: "testGMSAFileTaskARN",
945+
Family: "family",
946+
Version: "1",
947+
DesiredStatusUnsafe: apitaskstatus.TaskRunning,
948+
Containers: []*apicontainer.Container{testContainer},
949+
ExecutionCredentialsID: "exec-creds-id",
928950
}
929951
testTask.Containers[0].TransitionDependenciesMap = make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet)
930952
testTask.ResourcesMapUnsafe = make(map[string][]taskresource.TaskResource)

agent/engine/engine_unix_integ_test.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,22 @@ import (
3838
"github.com/aws/amazon-ecs-agent/agent/dockerclient"
3939
"github.com/aws/amazon-ecs-agent/agent/dockerclient/dockerapi"
4040
"github.com/aws/amazon-ecs-agent/agent/dockerclient/sdkclientfactory"
41+
mockssmfactory "github.com/aws/amazon-ecs-agent/agent/ssm/factory/mocks"
42+
mockssm "github.com/aws/amazon-ecs-agent/agent/ssm/mocks"
4143
"github.com/aws/amazon-ecs-agent/agent/statechange"
4244
"github.com/aws/amazon-ecs-agent/agent/taskresource"
4345
taskresourcevolume "github.com/aws/amazon-ecs-agent/agent/taskresource/volume"
4446
"github.com/aws/amazon-ecs-agent/agent/utils"
4547
apicontainerstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/container/status"
4648
apitaskstatus "github.com/aws/amazon-ecs-agent/ecs-agent/api/task/status"
49+
"github.com/aws/amazon-ecs-agent/ecs-agent/credentials"
50+
"github.com/aws/amazon-ecs-agent/ecs-agent/ipcompatibility"
4751
"github.com/aws/amazon-ecs-agent/ecs-agent/utils/ttime"
52+
"github.com/golang/mock/gomock"
4853

4954
"github.com/aws/aws-sdk-go-v2/aws"
55+
"github.com/aws/aws-sdk-go-v2/service/ssm"
56+
ssmtypes "github.com/aws/aws-sdk-go-v2/service/ssm/types"
5057
"github.com/cihub/seelog"
5158
"github.com/containerd/cgroups/v3"
5259
"github.com/docker/docker/api/types"
@@ -1908,3 +1915,107 @@ func TestHostResourceManagerLaunchTypeBehavior(t *testing.T) {
19081915
})
19091916
}
19101917
}
1918+
1919+
// TestTaskResourceDependencyOnRestart verifies that a task with resources properly waits for execution role credentials after agent restart, blocking
1920+
// until credentials arrive from ACS, then progressing to RUNNING.
1921+
// Note: although this test asserts functionality that is platform-agnostic, the test setup is such on Windows that we'd run into a timing issue.
1922+
// This is because the VerifyManifestPulledStateChange helper methods wait for events on the task engine's state channels,
1923+
// but on Windows, these are skipped due to no local registry setup, so the test progresses to its assertions faster than intended.
1924+
func TestTaskResourceDependencyOnRestart(t *testing.T) {
1925+
ctrl := gomock.NewController(t)
1926+
defer ctrl.Finish()
1927+
1928+
taskArn := "arn:aws:ecs:us-east-1:123456789012:task/testTaskResourceDependencyOnRestart"
1929+
testCredentialsID := "test-execution-credentials-id"
1930+
secretValueFrom := "arn:aws:ssm:us-east-1:123456789012:parameter/test-secret"
1931+
secretValue := "mock-secret-value"
1932+
1933+
// Create a task with a SSM secret resource dependency
1934+
testTask := CreateTestTask(taskArn)
1935+
testTask.Containers[0].Image = "busybox:latest" // Override container image
1936+
testTask.Containers[0].Secrets = []apicontainer.Secret{
1937+
{
1938+
Name: "test-secret",
1939+
ValueFrom: secretValueFrom,
1940+
Provider: "ssm",
1941+
Region: "us-east-1",
1942+
},
1943+
}
1944+
testTask.Containers[0].TransitionDependenciesMap = make(map[apicontainerstatus.ContainerStatus]apicontainer.TransitionDependencySet)
1945+
testTask.ResourcesMapUnsafe = make(map[string][]taskresource.TaskResource)
1946+
testTask.SetExecutionRoleCredentialsID(testCredentialsID)
1947+
1948+
// Start engine with default config
1949+
taskEngine, done, _, credentialsManager := setupWithDefaultConfig(t)
1950+
defer done()
1951+
1952+
// Set up SSM mocks
1953+
ssmClientCreator := mockssmfactory.NewMockSSMClientCreator(ctrl)
1954+
mockSSMClient := mockssm.NewMockSSMClient(ctrl)
1955+
mockCreds := &credentials.TaskIAMRoleCredentials{
1956+
ARN: taskArn,
1957+
IAMRoleCredentials: credentials.IAMRoleCredentials{
1958+
RoleArn: "arn:aws:iam::123456789012:role/execution-role",
1959+
CredentialsID: testCredentialsID,
1960+
},
1961+
}
1962+
ssmOutput := &ssm.GetParametersOutput{
1963+
Parameters: []ssmtypes.Parameter{
1964+
{
1965+
Name: aws.String(secretValueFrom),
1966+
Value: aws.String(secretValue),
1967+
},
1968+
},
1969+
}
1970+
ssmClientCreator.EXPECT().NewSSMClient("us-east-1", mockCreds.IAMRoleCredentials, ipcompatibility.NewIPv4OnlyCompatibility()).Return(mockSSMClient, nil).AnyTimes()
1971+
mockSSMClient.EXPECT().GetParameters(gomock.Any(), gomock.Any()).Do(func(ctx context.Context, in *ssm.GetParametersInput, optFns ...func(*ssm.Options)) {
1972+
require.Equal(t, []string{secretValueFrom}, in.Names)
1973+
}).Return(ssmOutput, nil).AnyTimes()
1974+
1975+
// Configure engine with SSM client creator
1976+
taskEngine.(*DockerTaskEngine).resourceFields = &taskresource.ResourceFields{
1977+
ResourceFieldsCommon: &taskresource.ResourceFieldsCommon{
1978+
SSMClientCreator: ssmClientCreator,
1979+
CredentialsManager: credentialsManager,
1980+
},
1981+
}
1982+
1983+
// Add task to engine without credentials
1984+
go taskEngine.AddTask(testTask)
1985+
1986+
// Verify task progresses but gets blocked waiting for execution credentials
1987+
VerifyContainerManifestPulledStateChange(t, taskEngine)
1988+
VerifyTaskManifestPulledStateChange(t, taskEngine)
1989+
1990+
// Verify the task has SSM resource that requires execution credentials
1991+
resources := testTask.GetResources()
1992+
var ssmResource taskresource.TaskResource
1993+
for _, resource := range resources {
1994+
if resource.GetName() == "ssmsecret" {
1995+
ssmResource = resource
1996+
break
1997+
}
1998+
}
1999+
require.NotNil(t, ssmResource, "SSM resource should exist")
2000+
require.True(t, ssmResource.RequiresExecutionRoleCredentials(), "SSM resource should require execution credentials")
2001+
2002+
// Mock credentials arrival from ACS
2003+
credentialsManager.SetTaskCredentials(mockCreds)
2004+
2005+
// Mimic ACS payload responder: after setting credentials, call AddTask to trigger task re-evaluation
2006+
// This is how blocked tasks get unblocked when credentials arrive from ACS
2007+
go taskEngine.AddTask(testTask)
2008+
2009+
// Verify task progresses through expected states after credentials are available
2010+
VerifyContainerRunningStateChange(t, taskEngine)
2011+
VerifyTaskRunningStateChange(t, taskEngine)
2012+
2013+
// Stop the task
2014+
taskUpdate := CreateTestTask("testTaskResourceDependencyOnRestart")
2015+
taskUpdate.Arn = taskArn
2016+
taskUpdate.SetDesiredStatus(apitaskstatus.TaskStopped)
2017+
go taskEngine.AddTask(taskUpdate)
2018+
2019+
VerifyContainerStoppedStateChange(t, taskEngine)
2020+
VerifyTaskStoppedStateChange(t, taskEngine)
2021+
}

agent/engine/task_manager.go

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,8 @@ import (
4848
)
4949

5050
const (
51-
// waitForPullCredentialsTimeout is the timeout agent trying to wait for pull
52-
// credentials from acs, after the timeout it will check the credentials manager
53-
// and start processing the task or start another round of waiting
54-
waitForPullCredentialsTimeout = 1 * time.Minute
51+
// waitForExecutionRoleCredentialsTimeout is the timeout when waiting for execution role credentials to arrive from ACS
52+
waitForExecutionRoleCredentialsTimeout = 1 * time.Minute
5553
systemPingTimeout = 5 * time.Second
5654
defaultTaskSteadyStatePollInterval = 5 * time.Minute
5755
defaultTaskSteadyStatePollIntervalJitter = 30 * time.Second
@@ -1070,7 +1068,7 @@ func (mtask *managedTask) progressTask() {
10701068
// task may be moved to stopped.
10711069
// anyResourceTransition is set to true when transition function needs to be called or
10721070
// known status can be changed
1073-
anyResourceTransition, resTransitions := mtask.startResourceTransitions(
1071+
anyResourceTransition, resTransitions, resourceReasons := mtask.startResourceTransitions(
10741072
func(resource taskresource.TaskResource, nextStatus resourcestatus.ResourceStatus) {
10751073
mtask.transitionResource(resource, nextStatus)
10761074
transitionChange <- struct{}{}
@@ -1092,7 +1090,9 @@ func (mtask *managedTask) progressTask() {
10921090
// its impossible for containers to move forward. We will do an additional check to see if we are waiting for ACS
10931091
// execution credentials. If not, then we will abort the task progression.
10941092
if !atLeastOneTransitionStarted && !blockedByOrderingDependencies {
1095-
if !mtask.isWaitingForACSExecutionCredentials(reasons) {
1093+
// Combine reasons from both container and resource transitions
1094+
allReasons := append(reasons, resourceReasons...)
1095+
if !mtask.isWaitingForACSExecutionCredentials(allReasons) {
10961096
mtask.handleContainersUnableToTransitionState()
10971097
}
10981098
return
@@ -1143,16 +1143,16 @@ func (mtask *managedTask) progressTask() {
11431143
func (mtask *managedTask) isWaitingForACSExecutionCredentials(reasons []error) bool {
11441144
for _, reason := range reasons {
11451145
if reason == dependencygraph.CredentialsNotResolvedErr {
1146-
logger.Info("Waiting for credentials to pull from ECR", logger.Fields{
1146+
logger.Info("Waiting for execution role credentials to arrive from ACS", logger.Fields{
11471147
field.TaskID: mtask.GetID(),
11481148
})
11491149

1150-
timeoutCtx, timeoutCancel := context.WithTimeout(mtask.ctx, waitForPullCredentialsTimeout)
1150+
timeoutCtx, timeoutCancel := context.WithTimeout(mtask.ctx, waitForExecutionRoleCredentialsTimeout)
11511151
defer timeoutCancel()
11521152

11531153
timedOut := mtask.waitEvent(timeoutCtx.Done())
11541154
if timedOut {
1155-
logger.Info("Timed out waiting for acs credentials message", logger.Fields{
1155+
logger.Info("Timed out waiting for execution role credentials to arrive from ACS", logger.Fields{
11561156
field.TaskID: mtask.GetID(),
11571157
})
11581158
}
@@ -1244,8 +1244,9 @@ func (mtask *managedTask) handleTerminalDependencyError(container *apicontainer.
12441244

12451245
// startResourceTransitions steps through each resource in the task and calls
12461246
// the passed transition function when a transition should occur
1247-
func (mtask *managedTask) startResourceTransitions(transitionFunc resourceTransitionFunc) (bool, map[string]string) {
1247+
func (mtask *managedTask) startResourceTransitions(transitionFunc resourceTransitionFunc) (bool, map[string]string, []error) {
12481248
anyCanTransition := false
1249+
var reasons []error
12491250
transitions := make(map[string]string)
12501251
for _, res := range mtask.GetResources() {
12511252
knownStatus := res.GetKnownStatus()
@@ -1260,6 +1261,19 @@ func (mtask *managedTask) startResourceTransitions(transitionFunc resourceTransi
12601261
})
12611262
continue
12621263
}
1264+
1265+
// Check if resource requires execution role credentials and whether they're available
1266+
if !mtask.taskExecutionRoleCredentialsResolved(res) {
1267+
logger.Info("Can't transition resource due to missing execution role credentials", logger.Fields{
1268+
field.TaskID: mtask.GetID(),
1269+
field.Resource: res.GetName(),
1270+
field.KnownStatus: res.StatusString(knownStatus),
1271+
field.DesiredStatus: res.StatusString(desiredStatus),
1272+
})
1273+
reasons = append(reasons, dependencygraph.CredentialsNotResolvedErr)
1274+
continue
1275+
}
1276+
12631277
anyCanTransition = true
12641278
transition := mtask.resourceNextState(res)
12651279
// If the resource is already in a transition, skip
@@ -1281,7 +1295,24 @@ func (mtask *managedTask) startResourceTransitions(transitionFunc resourceTransi
12811295
transitions[res.GetName()] = transition.status
12821296
go transitionFunc(res, transition.nextState)
12831297
}
1284-
return anyCanTransition, transitions
1298+
return anyCanTransition, transitions, reasons
1299+
}
1300+
1301+
// taskExecutionRoleCredentialsResolved checks if execution credentials are available for a task resource
1302+
func (mtask *managedTask) taskExecutionRoleCredentialsResolved(resource taskresource.TaskResource) bool {
1303+
// If resource doesn't need execution role credentials, it's always resolved
1304+
if !resource.RequiresExecutionRoleCredentials() {
1305+
return true
1306+
}
1307+
1308+
// If resource is already created, credentials were available when it was created
1309+
if resource.GetKnownStatus() >= resourcestatus.ResourceCreated {
1310+
return true
1311+
}
1312+
1313+
// Check if credentials are available
1314+
_, ok := mtask.credentialsManager.GetTaskCredentials(mtask.Task.GetExecutionCredentialsID())
1315+
return ok
12851316
}
12861317

12871318
// transitionResource calls applyResourceState, and then notifies the managed

0 commit comments

Comments
 (0)