@@ -34,14 +34,18 @@ import (
34
34
"github.com/containerd/cgroups/v3/cgroup1"
35
35
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
36
36
"github.com/containerd/containerd/api/types/runc/options"
37
+ "github.com/containerd/errdefs"
38
+ "github.com/stretchr/testify/assert"
39
+
37
40
. "github.com/containerd/containerd/v2/client"
38
41
"github.com/containerd/containerd/v2/core/containers"
42
+ "github.com/containerd/containerd/v2/integration/failpoint"
39
43
"github.com/containerd/containerd/v2/pkg/cio"
44
+ "github.com/containerd/containerd/v2/pkg/fifosync"
40
45
"github.com/containerd/containerd/v2/pkg/oci"
41
46
"github.com/containerd/containerd/v2/pkg/shim"
42
47
"github.com/containerd/containerd/v2/pkg/sys"
43
48
"github.com/containerd/containerd/v2/plugins"
44
- "github.com/containerd/errdefs"
45
49
46
50
"github.com/opencontainers/runtime-spec/specs-go"
47
51
"github.com/stretchr/testify/require"
@@ -1551,3 +1555,207 @@ func TestIssue9103(t *testing.T) {
1551
1555
})
1552
1556
}
1553
1557
}
1558
+
1559
+ // TestIssue10589 is used as regression case for issue 10589.
1560
+ //
1561
+ // This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1562
+ // by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1563
+ // allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1564
+ //
1565
+ // The workflow is as follows:
1566
+ // 1. Create a container as normal
1567
+ // 2. Make an exec1 using runc-fp with delayexec
1568
+ // 3. Wait until the exec is waiting to start (triggered by delayexec)
1569
+ // 4. Kill the container init process (signalling it is easiest)
1570
+ // 5. Make an exec2 using runc-fp with delayexec
1571
+ // 6. Wait until the exec is waiting to start
1572
+ // 7. Allow exec1 to proceed
1573
+ // 8. Allow exec2 to proceed
1574
+ // 9. See that the container has exited and all execs have exited too
1575
+ //
1576
+ // https://github.com/containerd/containerd/issues/10589
1577
+ func TestIssue10589 (t * testing.T ) {
1578
+ if f := os .Getenv ("RUNC_FLAVOR" ); f != "" && f != "runc" {
1579
+ t .Skip ("test requires runc" )
1580
+ }
1581
+
1582
+ client , err := newClient (t , address )
1583
+ require .NoError (t , err )
1584
+ t .Cleanup (func () {
1585
+ client .Close ()
1586
+ })
1587
+
1588
+ var (
1589
+ image Image
1590
+ ctx , cancel = testContext (t )
1591
+ id = t .Name ()
1592
+ )
1593
+ t .Cleanup (cancel )
1594
+
1595
+ image , err = client .GetImage (ctx , testImage )
1596
+ require .NoError (t , err )
1597
+
1598
+ // 1. Create a sleeping container
1599
+ t .Log ("1. Create a sleeping container" )
1600
+ container , err := client .NewContainer (ctx , id ,
1601
+ WithNewSnapshot (id , image ),
1602
+ WithNewSpec (oci .WithImageConfig (image ),
1603
+ withProcessArgs ("sleep" , "inf" ),
1604
+ oci .WithAnnotations (map [string ]string {
1605
+ "oci.runc.failpoint.profile" : "delayExec" ,
1606
+ }),
1607
+ ),
1608
+ WithRuntime (client .Runtime (), & options.Options {
1609
+ BinaryName : "runc-fp" ,
1610
+ }),
1611
+ )
1612
+ require .NoError (t , err , "create container" )
1613
+ t .Cleanup (func () {
1614
+ ctx , cancel := context .WithTimeout (ctx , 10 * time .Second )
1615
+ err := container .Delete (ctx , WithSnapshotCleanup )
1616
+ if err != nil {
1617
+ t .Log ("delete err" , err )
1618
+ }
1619
+ cancel ()
1620
+ })
1621
+
1622
+ task , err := container .NewTask (ctx , empty ())
1623
+ require .NoError (t , err , "create task" )
1624
+ t .Cleanup (func () {
1625
+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Second )
1626
+ st , err := task .Delete (ctx , WithProcessKill )
1627
+ t .Log ("exit status" , st )
1628
+ if err != nil {
1629
+ t .Log ("kill err" , err )
1630
+ }
1631
+ cancel ()
1632
+ })
1633
+
1634
+ err = task .Start (ctx )
1635
+ require .NoError (t , err , "start container" )
1636
+
1637
+ status , err := task .Status (ctx )
1638
+ require .NoError (t , err , "container status" )
1639
+ require .Equal (t , Running , status .Status )
1640
+
1641
+ // 2. Create an exec
1642
+ t .Log ("2. Create exec1" )
1643
+ exec1ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec1-ready.fifo" ), 0600 )
1644
+ require .NoError (t , err , "create exec1 ready fifo" )
1645
+ exec1DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec1-delay.fifo" ), 0600 )
1646
+ require .NoError (t , err , "create exec1 delay fifo" )
1647
+ exec1 , err := task .Exec (ctx , "exec1" , & specs.Process {
1648
+ Args : []string {"/bin/sleep" , "301" },
1649
+ Cwd : "/" ,
1650
+ Env : []string {
1651
+ failpoint .DelayExecReadyEnv + "=" + exec1ReadyFifo .Name (),
1652
+ failpoint .DelayExecDelayEnv + "=" + exec1DelayFifo .Name (),
1653
+ },
1654
+ }, cio .NullIO )
1655
+ require .NoError (t , err , "create exec1" )
1656
+
1657
+ exec1done := make (chan struct {})
1658
+ go func () {
1659
+ defer close (exec1done )
1660
+ t .Log ("Starting exec1" )
1661
+ err := exec1 .Start (ctx )
1662
+ assert .Error (t , err , "start exec1" )
1663
+ t .Logf ("error starting exec1: %s" , err )
1664
+ }()
1665
+
1666
+ // 3. Wait until the exec is waiting to start
1667
+ t .Log ("3. Wait until exec1 is waiting to start" )
1668
+ err = exec1ReadyFifo .Wait ()
1669
+ require .NoError (t , err , "open exec1 fifo" )
1670
+
1671
+ // 4. Kill the container init process
1672
+ t .Log ("4. Kill the container init process" )
1673
+ target := task .Pid ()
1674
+ t .Logf ("Killing main pid (%v) of container %s" , target , container .ID ())
1675
+ syscall .Kill (int (target ), syscall .SIGKILL )
1676
+ status , err = task .Status (ctx )
1677
+ require .NoError (t , err , "container status" )
1678
+ t .Log ("container status" , status .Status )
1679
+
1680
+ // 5. Make an exec (2) using this failpoint
1681
+ t .Log ("5. Create exec2" )
1682
+ exec2ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec2-ready.fifo" ), 0600 )
1683
+ require .NoError (t , err , "create exec2 ready fifo: %q" , exec2ReadyFifo )
1684
+ exec2DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec2-delay.fifo" ), 0600 )
1685
+ require .NoError (t , err , "create exec2 delay fifo: %q" , exec2DelayFifo )
1686
+ exec2 , err := task .Exec (ctx , "exec2" , & specs.Process {
1687
+ Args : []string {"/bin/sleep" , "302" },
1688
+ Cwd : "/" ,
1689
+ Env : []string {
1690
+ failpoint .DelayExecReadyEnv + "=" + exec2ReadyFifo .Name (),
1691
+ failpoint .DelayExecDelayEnv + "=" + exec2DelayFifo .Name (),
1692
+ },
1693
+ }, cio .NullIO )
1694
+ require .NoError (t , err , "create exec2" )
1695
+
1696
+ exec2done := make (chan struct {})
1697
+ didExec2Run := true
1698
+ go func () {
1699
+ defer close (exec2done )
1700
+ t .Log ("Starting exec2" )
1701
+ err := exec2 .Start (ctx )
1702
+ assert .Error (t , err , "start exec2" )
1703
+ t .Logf ("error starting exec2: %s" , err )
1704
+ }()
1705
+
1706
+ // 6. Wait until the exec is waiting to start
1707
+ t .Log ("6. Wait until exec2 is waiting to start" )
1708
+ exec2ready := make (chan struct {})
1709
+ go func () {
1710
+ exec2ReadyFifo .Wait ()
1711
+ close (exec2ready )
1712
+ }()
1713
+ select {
1714
+ case <- exec2ready :
1715
+ case <- exec2done :
1716
+ didExec2Run = false
1717
+ }
1718
+
1719
+ // 7. Allow exec=1 to proceed
1720
+ t .Log ("7. Allow exec=1 to proceed" )
1721
+ err = exec1DelayFifo .Trigger ()
1722
+ assert .NoError (t , err , "trigger exec1 fifo" )
1723
+ status , err = task .Status (ctx )
1724
+ require .NoError (t , err , "container status" )
1725
+ t .Log ("container status" , status .Status )
1726
+ <- exec1done
1727
+ status , err = task .Status (ctx )
1728
+ require .NoError (t , err , "container status" )
1729
+ t .Log ("container status" , status .Status )
1730
+
1731
+ // 8. Allow exec=2 to proceed
1732
+ if didExec2Run {
1733
+ t .Log ("8. Allow exec2 to proceed" )
1734
+ err = exec2DelayFifo .Trigger ()
1735
+ assert .NoError (t , err , "trigger exec2 fifo" )
1736
+ status , err = task .Status (ctx )
1737
+ require .NoError (t , err , "container status" )
1738
+ t .Log ("container status" , status .Status )
1739
+ <- exec2done
1740
+ status , err = task .Status (ctx )
1741
+ require .NoError (t , err , "container status" )
1742
+ t .Log ("container status" , status .Status )
1743
+ } else {
1744
+ t .Log ("8. Skip exec2" )
1745
+ }
1746
+
1747
+ // 9. Validate
1748
+ t .Log ("9. Validate" )
1749
+ status , err = exec1 .Status (ctx )
1750
+ require .NoError (t , err , "exec1 status" )
1751
+ t .Logf ("exec1 status: %s" , status .Status )
1752
+ assert .Equal (t , Created , status .Status )
1753
+ status , err = exec2 .Status (ctx )
1754
+ require .NoError (t , err , "exec2 status" )
1755
+ t .Logf ("exec2 status: %s" , status .Status )
1756
+ assert .Equal (t , Created , status .Status )
1757
+ status , err = task .Status (ctx )
1758
+ t .Logf ("task status: %s" , status .Status )
1759
+ require .NoError (t , err , "container status" )
1760
+ assert .Equal (t , Stopped , status .Status )
1761
+ }
0 commit comments