Skip to content

Commit e013322

Browse files
authored
Merge pull request containerd#10649 from samuelkarp/shim-exec-fp-test
integration: regression test for issue 10589
2 parents c8b095f + 18725f0 commit e013322

File tree

5 files changed

+490
-2
lines changed

5 files changed

+490
-2
lines changed

integration/client/container_linux_test.go

Lines changed: 209 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,18 @@ import (
3434
"github.com/containerd/cgroups/v3/cgroup1"
3535
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
3636
"github.com/containerd/containerd/api/types/runc/options"
37+
"github.com/containerd/errdefs"
38+
"github.com/stretchr/testify/assert"
39+
3740
. "github.com/containerd/containerd/v2/client"
3841
"github.com/containerd/containerd/v2/core/containers"
42+
"github.com/containerd/containerd/v2/integration/failpoint"
3943
"github.com/containerd/containerd/v2/pkg/cio"
44+
"github.com/containerd/containerd/v2/pkg/fifosync"
4045
"github.com/containerd/containerd/v2/pkg/oci"
4146
"github.com/containerd/containerd/v2/pkg/shim"
4247
"github.com/containerd/containerd/v2/pkg/sys"
4348
"github.com/containerd/containerd/v2/plugins"
44-
"github.com/containerd/errdefs"
4549

4650
"github.com/opencontainers/runtime-spec/specs-go"
4751
"github.com/stretchr/testify/require"
@@ -1551,3 +1555,207 @@ func TestIssue9103(t *testing.T) {
15511555
})
15521556
}
15531557
}
1558+
1559+
// TestIssue10589 is used as regression case for issue 10589.
1560+
//
1561+
// This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1562+
// by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1563+
// allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1564+
//
1565+
// The workflow is as follows:
1566+
// 1. Create a container as normal
1567+
// 2. Make an exec1 using runc-fp with delayexec
1568+
// 3. Wait until the exec is waiting to start (triggered by delayexec)
1569+
// 4. Kill the container init process (signalling it is easiest)
1570+
// 5. Make an exec2 using runc-fp with delayexec
1571+
// 6. Wait until the exec is waiting to start
1572+
// 7. Allow exec1 to proceed
1573+
// 8. Allow exec2 to proceed
1574+
// 9. See that the container has exited and all execs have exited too
1575+
//
1576+
// https://github.com/containerd/containerd/issues/10589
1577+
func TestIssue10589(t *testing.T) {
1578+
if f := os.Getenv("RUNC_FLAVOR"); f != "" && f != "runc" {
1579+
t.Skip("test requires runc")
1580+
}
1581+
1582+
client, err := newClient(t, address)
1583+
require.NoError(t, err)
1584+
t.Cleanup(func() {
1585+
client.Close()
1586+
})
1587+
1588+
var (
1589+
image Image
1590+
ctx, cancel = testContext(t)
1591+
id = t.Name()
1592+
)
1593+
t.Cleanup(cancel)
1594+
1595+
image, err = client.GetImage(ctx, testImage)
1596+
require.NoError(t, err)
1597+
1598+
// 1. Create a sleeping container
1599+
t.Log("1. Create a sleeping container")
1600+
container, err := client.NewContainer(ctx, id,
1601+
WithNewSnapshot(id, image),
1602+
WithNewSpec(oci.WithImageConfig(image),
1603+
withProcessArgs("sleep", "inf"),
1604+
oci.WithAnnotations(map[string]string{
1605+
"oci.runc.failpoint.profile": "delayExec",
1606+
}),
1607+
),
1608+
WithRuntime(client.Runtime(), &options.Options{
1609+
BinaryName: "runc-fp",
1610+
}),
1611+
)
1612+
require.NoError(t, err, "create container")
1613+
t.Cleanup(func() {
1614+
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
1615+
err := container.Delete(ctx, WithSnapshotCleanup)
1616+
if err != nil {
1617+
t.Log("delete err", err)
1618+
}
1619+
cancel()
1620+
})
1621+
1622+
task, err := container.NewTask(ctx, empty())
1623+
require.NoError(t, err, "create task")
1624+
t.Cleanup(func() {
1625+
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
1626+
st, err := task.Delete(ctx, WithProcessKill)
1627+
t.Log("exit status", st)
1628+
if err != nil {
1629+
t.Log("kill err", err)
1630+
}
1631+
cancel()
1632+
})
1633+
1634+
err = task.Start(ctx)
1635+
require.NoError(t, err, "start container")
1636+
1637+
status, err := task.Status(ctx)
1638+
require.NoError(t, err, "container status")
1639+
require.Equal(t, Running, status.Status)
1640+
1641+
// 2. Create an exec
1642+
t.Log("2. Create exec1")
1643+
exec1ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec1-ready.fifo"), 0600)
1644+
require.NoError(t, err, "create exec1 ready fifo")
1645+
exec1DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec1-delay.fifo"), 0600)
1646+
require.NoError(t, err, "create exec1 delay fifo")
1647+
exec1, err := task.Exec(ctx, "exec1", &specs.Process{
1648+
Args: []string{"/bin/sleep", "301"},
1649+
Cwd: "/",
1650+
Env: []string{
1651+
failpoint.DelayExecReadyEnv + "=" + exec1ReadyFifo.Name(),
1652+
failpoint.DelayExecDelayEnv + "=" + exec1DelayFifo.Name(),
1653+
},
1654+
}, cio.NullIO)
1655+
require.NoError(t, err, "create exec1")
1656+
1657+
exec1done := make(chan struct{})
1658+
go func() {
1659+
defer close(exec1done)
1660+
t.Log("Starting exec1")
1661+
err := exec1.Start(ctx)
1662+
assert.Error(t, err, "start exec1")
1663+
t.Logf("error starting exec1: %s", err)
1664+
}()
1665+
1666+
// 3. Wait until the exec is waiting to start
1667+
t.Log("3. Wait until exec1 is waiting to start")
1668+
err = exec1ReadyFifo.Wait()
1669+
require.NoError(t, err, "open exec1 fifo")
1670+
1671+
// 4. Kill the container init process
1672+
t.Log("4. Kill the container init process")
1673+
target := task.Pid()
1674+
t.Logf("Killing main pid (%v) of container %s", target, container.ID())
1675+
syscall.Kill(int(target), syscall.SIGKILL)
1676+
status, err = task.Status(ctx)
1677+
require.NoError(t, err, "container status")
1678+
t.Log("container status", status.Status)
1679+
1680+
// 5. Make an exec (2) using this failpoint
1681+
t.Log("5. Create exec2")
1682+
exec2ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec2-ready.fifo"), 0600)
1683+
require.NoError(t, err, "create exec2 ready fifo: %q", exec2ReadyFifo)
1684+
exec2DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec2-delay.fifo"), 0600)
1685+
require.NoError(t, err, "create exec2 delay fifo: %q", exec2DelayFifo)
1686+
exec2, err := task.Exec(ctx, "exec2", &specs.Process{
1687+
Args: []string{"/bin/sleep", "302"},
1688+
Cwd: "/",
1689+
Env: []string{
1690+
failpoint.DelayExecReadyEnv + "=" + exec2ReadyFifo.Name(),
1691+
failpoint.DelayExecDelayEnv + "=" + exec2DelayFifo.Name(),
1692+
},
1693+
}, cio.NullIO)
1694+
require.NoError(t, err, "create exec2")
1695+
1696+
exec2done := make(chan struct{})
1697+
didExec2Run := true
1698+
go func() {
1699+
defer close(exec2done)
1700+
t.Log("Starting exec2")
1701+
err := exec2.Start(ctx)
1702+
assert.Error(t, err, "start exec2")
1703+
t.Logf("error starting exec2: %s", err)
1704+
}()
1705+
1706+
// 6. Wait until the exec is waiting to start
1707+
t.Log("6. Wait until exec2 is waiting to start")
1708+
exec2ready := make(chan struct{})
1709+
go func() {
1710+
exec2ReadyFifo.Wait()
1711+
close(exec2ready)
1712+
}()
1713+
select {
1714+
case <-exec2ready:
1715+
case <-exec2done:
1716+
didExec2Run = false
1717+
}
1718+
1719+
// 7. Allow exec=1 to proceed
1720+
t.Log("7. Allow exec=1 to proceed")
1721+
err = exec1DelayFifo.Trigger()
1722+
assert.NoError(t, err, "trigger exec1 fifo")
1723+
status, err = task.Status(ctx)
1724+
require.NoError(t, err, "container status")
1725+
t.Log("container status", status.Status)
1726+
<-exec1done
1727+
status, err = task.Status(ctx)
1728+
require.NoError(t, err, "container status")
1729+
t.Log("container status", status.Status)
1730+
1731+
// 8. Allow exec=2 to proceed
1732+
if didExec2Run {
1733+
t.Log("8. Allow exec2 to proceed")
1734+
err = exec2DelayFifo.Trigger()
1735+
assert.NoError(t, err, "trigger exec2 fifo")
1736+
status, err = task.Status(ctx)
1737+
require.NoError(t, err, "container status")
1738+
t.Log("container status", status.Status)
1739+
<-exec2done
1740+
status, err = task.Status(ctx)
1741+
require.NoError(t, err, "container status")
1742+
t.Log("container status", status.Status)
1743+
} else {
1744+
t.Log("8. Skip exec2")
1745+
}
1746+
1747+
// 9. Validate
1748+
t.Log("9. Validate")
1749+
status, err = exec1.Status(ctx)
1750+
require.NoError(t, err, "exec1 status")
1751+
t.Logf("exec1 status: %s", status.Status)
1752+
assert.Equal(t, Created, status.Status)
1753+
status, err = exec2.Status(ctx)
1754+
require.NoError(t, err, "exec2 status")
1755+
t.Logf("exec2 status: %s", status.Status)
1756+
assert.Equal(t, Created, status.Status)
1757+
status, err = task.Status(ctx)
1758+
t.Logf("task status: %s", status.Status)
1759+
require.NoError(t, err, "container status")
1760+
assert.Equal(t, Stopped, status.Status)
1761+
}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
//go:build linux
2+
3+
/*
4+
Copyright The containerd Authors.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
package main
20+
21+
import (
22+
"context"
23+
"encoding/json"
24+
"errors"
25+
"fmt"
26+
"io"
27+
"os"
28+
"strings"
29+
30+
"github.com/opencontainers/runtime-spec/specs-go"
31+
"github.com/sirupsen/logrus"
32+
33+
"github.com/containerd/containerd/v2/integration/failpoint"
34+
"github.com/containerd/containerd/v2/pkg/fifosync"
35+
)
36+
37+
// delayExec delays an "exec" command until a trigger is received from the calling test program. This can be used to
38+
// test races around container lifecycle and exec processes.
39+
func delayExec(ctx context.Context, method invoker) error {
40+
isExec := strings.Contains(strings.Join(os.Args, ","), ",exec,")
41+
if !isExec {
42+
if err := method(ctx); err != nil {
43+
return err
44+
}
45+
return nil
46+
}
47+
logrus.Debug("EXEC!")
48+
49+
if err := delay(); err != nil {
50+
return err
51+
}
52+
if err := method(ctx); err != nil {
53+
return err
54+
}
55+
return nil
56+
}
57+
58+
func delay() error {
59+
ready, delay, err := fifoFromProcessEnv()
60+
if err != nil {
61+
return err
62+
}
63+
if err := ready.Trigger(); err != nil {
64+
return err
65+
}
66+
return delay.Wait()
67+
}
68+
69+
// fifoFromProcessEnv finds a fifo specified in the environment variables of an exec process
70+
func fifoFromProcessEnv() (fifosync.Trigger, fifosync.Waiter, error) {
71+
env, err := processEnvironment()
72+
if err != nil {
73+
return nil, nil, err
74+
}
75+
76+
readyName, ok := env[failpoint.DelayExecReadyEnv]
77+
if !ok {
78+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecReadyEnv, env)
79+
}
80+
delayName, ok := env[failpoint.DelayExecDelayEnv]
81+
if !ok {
82+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecDelayEnv, env)
83+
}
84+
logrus.WithField("ready", readyName).WithField("delay", delayName).Debug("Found FIFOs!")
85+
readyFIFO, err := fifosync.NewTrigger(readyName, 0600)
86+
if err != nil {
87+
return nil, nil, err
88+
}
89+
delayFIFO, err := fifosync.NewWaiter(delayName, 0600)
90+
if err != nil {
91+
return nil, nil, err
92+
}
93+
return readyFIFO, delayFIFO, nil
94+
}
95+
96+
func processEnvironment() (map[string]string, error) {
97+
idx := 2
98+
for ; idx < len(os.Args); idx++ {
99+
if os.Args[idx] == "--process" {
100+
break
101+
}
102+
}
103+
104+
if idx >= len(os.Args)-1 || os.Args[idx] != "--process" {
105+
return nil, errors.New("env: option --process required")
106+
}
107+
108+
specFile := os.Args[idx+1]
109+
f, err := os.OpenFile(specFile, os.O_RDONLY, 0o644)
110+
if err != nil {
111+
return nil, fmt.Errorf("env: failed to open %s: %w", specFile, err)
112+
}
113+
114+
b, err := io.ReadAll(f)
115+
if err != nil {
116+
return nil, fmt.Errorf("env: failed to read spec from %q", specFile)
117+
}
118+
var spec specs.Process
119+
if err := json.Unmarshal(b, &spec); err != nil {
120+
return nil, fmt.Errorf("env: failed to unmarshal spec from %q: %w", specFile, err)
121+
}
122+
123+
// XXX: env vars can be specified multiple times, but we only keep one
124+
env := make(map[string]string)
125+
for _, e := range spec.Env {
126+
k, v, _ := strings.Cut(e, "=")
127+
env[k] = v
128+
}
129+
130+
return env, nil
131+
}

integration/failpoint/cmd/runc-fp/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ import (
2525
"os/exec"
2626
"syscall"
2727

28-
"github.com/containerd/containerd/v2/pkg/oci"
2928
"github.com/sirupsen/logrus"
29+
30+
"github.com/containerd/containerd/v2/pkg/oci"
3031
)
3132

3233
const (
@@ -40,6 +41,7 @@ type invokerInterceptor func(context.Context, invoker) error
4041
var (
4142
failpointProfiles = map[string]invokerInterceptor{
4243
"issue9103": issue9103KillInitAfterCreate,
44+
"delayExec": delayExec,
4345
}
4446
)
4547

0 commit comments

Comments
 (0)