Skip to content

Commit f7eee72

Browse files
committed
WIP: Forward parent death to descendant processes (unix)
If the uds_fd connection to the parent BEAM is broken or closed, react by killing all spawned children. When a spawned port is closed, kill the associated OS process. A concise demonstration of the problem being solved is to run the following command with and without the patch, and then kill the BEAM; without the patch, the "sleep" process keeps running: erl -noshell -eval 'os:cmd("sleep 60")' The forker now keeps a mapping of all living child processes so that it can iterate over them during cleanup. Previously, child processes were only stored in forker_hash if the port was opened with the exit_status option. A new forker command message (Stop) is introduced, which lets the forker kill the child and clean up its internal resources if the port is closed before the process ends naturally. TODO: * Decide between killing the process and killing the process group. * Separate patch for win32.
1 parent f64ad0a commit f7eee72

File tree

4 files changed

+124
-79
lines changed

4 files changed

+124
-79
lines changed

erts/emulator/sys/unix/erl_child_setup.c

Lines changed: 77 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,15 @@
9696
# define FD_ZERO(FD_SET_PTR) memset(FD_SET_PTR, 0, sizeof(fd_set))
9797
#endif
9898

99+
typedef struct exit_status {
100+
HashBucket hb;
101+
pid_t os_pid;
102+
Eterm port_id;
103+
bool want_exit_status;
104+
} ErtsSysExitStatus;
105+
106+
static Hash *forker_hash;
107+
99108
static char abort_reason[200]; /* for core dump inspection */
100109

101110
static void ABORT(const char* fmt, ...)
@@ -174,8 +183,8 @@ static ssize_t write_all(int fd, const char *buff, size_t size) {
174183
return pos;
175184
}
176185

177-
static void add_os_pid_to_port_id_mapping(Eterm, pid_t);
178-
static Eterm get_port_id(pid_t);
186+
static void kill_child(pid_t os_pid);
187+
static void kill_all_children(void);
179188
static int forker_hash_init(void);
180189

181190
static int max_files = -1;
@@ -564,6 +573,7 @@ main(int argc, char *argv[])
564573
tcsetattr(0,TCSANOW,&initial_tty_mode);
565574
}
566575
DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno);
576+
kill_all_children();
567577
_exit(0);
568578
}
569579

@@ -572,97 +582,104 @@ main(int argc, char *argv[])
572582
if (isatty(0) && isatty(1)) {
573583
tcsetattr(0,TCSANOW,&initial_tty_mode);
574584
}
585+
kill_all_children();
575586
_exit(0);
576587
}
577588
/* Since we use unix domain sockets and send the entire data in
578589
one go we *should* get the entire payload at once. */
579590
ASSERT(res == sizeof(proto));
580-
ASSERT(proto.action == ErtsSysForkerProtoAction_Start);
591+
if (proto.action == ErtsSysForkerProtoAction_Start) {
592+
ErtsSysExitStatus es;
581593

582-
sys_sigblock(SIGCHLD);
594+
sys_sigblock(SIGCHLD);
583595

584-
errno = 0;
596+
errno = 0;
585597

586-
os_pid = fork();
587-
if (os_pid == 0)
588-
start_new_child(pipes);
598+
os_pid = fork();
599+
if (os_pid == 0)
600+
start_new_child(pipes);
589601

590-
add_os_pid_to_port_id_mapping(proto.u.start.port_id, os_pid);
602+
es.os_pid = os_pid;
603+
es.port_id = proto.u.start.port_id;
604+
es.want_exit_status = proto.u.start.want_exit_status;
605+
hash_put(forker_hash, &es);
591606

592-
/* We write an ack here, but expect the reply on
593-
the pipes[0] inside the fork */
594-
proto.action = ErtsSysForkerProtoAction_Go;
595-
proto.u.go.os_pid = os_pid;
596-
proto.u.go.error_number = errno;
597-
write_all(pipes[1], (char *)&proto, sizeof(proto));
607+
/* We write an ack here, but expect the reply on
608+
the pipes[0] inside the fork */
609+
proto.action = ErtsSysForkerProtoAction_Go;
610+
proto.u.go.os_pid = os_pid;
611+
proto.u.go.error_number = errno;
612+
write_all(pipes[1], (char *)&proto, sizeof(proto));
598613

599614
#ifdef FORKER_PROTO_START_ACK
600-
proto.action = ErtsSysForkerProtoAction_StartAck;
601-
write_all(uds_fd, (char *)&proto, sizeof(proto));
615+
proto.action = ErtsSysForkerProtoAction_StartAck;
616+
write_all(uds_fd, (char *)&proto, sizeof(proto));
602617
#endif
603618

604-
sys_sigrelease(SIGCHLD);
605-
close(pipes[0]);
606-
close(pipes[1]);
607-
close(pipes[2]);
619+
sys_sigrelease(SIGCHLD);
620+
close(pipes[0]);
621+
close(pipes[1]);
622+
close(pipes[2]);
623+
} else if (proto.action == ErtsSysForkerProtoAction_Stop) {
624+
ErtsSysExitStatus est, *es;
625+
est.os_pid = proto.u.stop.os_pid;
626+
es = hash_remove(forker_hash, &est);
627+
if (es) {
628+
kill_child(es->os_pid);
629+
free(es);
630+
}
631+
} else {
632+
#ifdef DEBUG
633+
ABORT("Unknown command from parent: %d", proto.action);
634+
#endif
635+
}
608636
}
609637

610638
if (FD_ISSET(sigchld_pipe[0], &read_fds)) {
611639
int ibuff[2];
612640
ErtsSysForkerProto proto;
641+
ErtsSysExitStatus est, *es;
613642
res = read_all(sigchld_pipe[0], (char *)ibuff, sizeof(ibuff));
614643
if (res <= 0) {
615644
ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno);
616645
}
617646

618-
proto.u.sigchld.port_id = get_port_id((pid_t)(ibuff[0]));
619-
620-
if (proto.u.sigchld.port_id == THE_NON_VALUE)
621-
continue; /* exit status report not requested */
622-
623-
proto.action = ErtsSysForkerProtoAction_SigChld;
624-
proto.u.sigchld.error_number = ibuff[1];
625-
DEBUG_PRINT("send sigchld to %d (errno = %d)", uds_fd, ibuff[1]);
626-
if (write_all(uds_fd, (char *)&proto, sizeof(proto)) < 0) {
627-
/* The uds was close, which most likely means that the VM
628-
has exited. This will be detected when we try to read
629-
from the uds_fd. */
630-
DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno);
647+
est.os_pid = (pid_t)ibuff[0];
648+
es = hash_remove(forker_hash, &est);
649+
650+
if (es && es->want_exit_status) {
651+
proto.action = ErtsSysForkerProtoAction_SigChld;
652+
proto.u.sigchld.port_id = es->port_id;
653+
proto.u.sigchld.error_number = ibuff[1];
654+
DEBUG_PRINT("send sigchld to %d (errno = %d)", uds_fd, ibuff[1]);
655+
if (write_all(uds_fd, (char *)&proto, sizeof(proto)) < 0) {
656+
/* The uds was close, which most likely means that the VM
657+
has exited. This will be detected when we try to read
658+
from the uds_fd. */
659+
DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno);
660+
}
661+
free(es);
631662
}
632663
}
633664
}
634665
return 1;
635666
}
636667

637-
typedef struct exit_status {
638-
HashBucket hb;
639-
pid_t os_pid;
640-
Eterm port_id;
641-
} ErtsSysExitStatus;
642-
643-
static Hash *forker_hash;
668+
/* Send SIGTERM to child processes when their port is closed or the VM terminates, so they don't become orphaned. Only the child itself is signalled, not its process group. */
644669

645-
static void add_os_pid_to_port_id_mapping(Eterm port_id, pid_t os_pid)
646-
{
647-
if (port_id != THE_NON_VALUE) {
648-
/* exit status report requested */
649-
ErtsSysExitStatus es;
650-
es.os_pid = os_pid;
651-
es.port_id = port_id;
652-
hash_put(forker_hash, &es);
670+
/*
 * Send SIGTERM to one child process.
 *
 * Only strictly positive pids are signalled; zero or negative values
 * (e.g. the -1 stored after a failed fork()) are ignored, since passing
 * them to kill() would address something other than a single process.
 * A failing kill() is only reported through DEBUG_PRINT.
 */
static void kill_child(pid_t os_pid)
{
    if (os_pid <= 0)
        return;

    if (kill(os_pid, SIGTERM) == 0)
        return;

    DEBUG_PRINT("error killing process %d: %d", os_pid, errno);
}
655675

656-
static Eterm get_port_id(pid_t os_pid)
657-
{
658-
ErtsSysExitStatus est, *es;
659-
Eterm port_id;
660-
est.os_pid = os_pid;
661-
es = hash_remove(forker_hash, &est);
662-
if (!es) return THE_NON_VALUE;
663-
port_id = es->port_id;
664-
free(es);
665-
return port_id;
676+
/*
 * hash_foreach() callback: signal the child recorded in one forker_hash
 * entry. The second argument is part of the callback shape and is not used.
 */
static void fun_kill_foreach(ErtsSysExitStatus *es, void *unused)
{
    pid_t child = es->os_pid;

    (void)unused;
    kill_child(child);
}
679+
680+
static void kill_all_children(void) {
681+
DEBUG_PRINT("cleaning up by killing all %d child processes", forker_hash->nobjs);
682+
hash_foreach(forker_hash, (HFOREACH_FUN)fun_kill_foreach, NULL);
666683
}
667684

668685
static int fcmp(void *a, void *b)
@@ -691,6 +708,7 @@ static void *falloc(void *e)
691708
ErtsSysExitStatus *ne = malloc(sizeof(ErtsSysExitStatus));
692709
ne->os_pid = se->os_pid;
693710
ne->port_id = se->port_id;
711+
ne->want_exit_status = se->want_exit_status;
694712
return ne;
695713
}
696714

erts/emulator/sys/unix/erl_child_setup.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,14 @@ typedef struct ErtsSysForkerProto_ {
5656
ErtsSysForkerProtoAction_StartAck,
5757
ErtsSysForkerProtoAction_Go,
5858
ErtsSysForkerProtoAction_SigChld,
59-
ErtsSysForkerProtoAction_Ack
59+
ErtsSysForkerProtoAction_Ack,
60+
ErtsSysForkerProtoAction_Stop
6061
} action;
6162
union {
6263
struct {
6364
ErtsSysPortId port_id;
6465
int fds[3];
66+
bool want_exit_status;
6567
} start;
6668
struct {
6769
pid_t os_pid;
@@ -71,6 +73,9 @@ typedef struct ErtsSysForkerProto_ {
7173
ErtsSysPortId port_id;
7274
int error_number;
7375
} sigchld;
76+
struct {
77+
pid_t os_pid;
78+
} stop;
7479
} u;
7580
} ErtsSysForkerProto;
7681

erts/emulator/sys/unix/sys_drivers.c

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -703,7 +703,8 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name,
703703
proto->u.start.fds[0] = ofd[0];
704704
proto->u.start.fds[1] = ifd[1];
705705
proto->u.start.fds[2] = stderrfd;
706-
proto->u.start.port_id = opts->exit_status ? erts_drvport2id(port_num) : THE_NON_VALUE;
706+
proto->u.start.port_id = erts_drvport2id(port_num);
707+
proto->u.start.want_exit_status = opts->exit_status;
707708
if (erl_drv_port_control(forker_port, ERTS_FORKER_DRV_CONTROL_MAGIC_NUMBER,
708709
(char*)proto, sizeof(*proto))) {
709710
/* The forker port has been killed, we close both fd's which will
@@ -1046,6 +1047,16 @@ static void stop(ErlDrvData ev)
10461047
driver_select(prt, abs(dd->ofd->fd), ERL_DRV_USE, 0); /* close(ofd); */
10471048
}
10481049

1050+
if (dd->pid > 0) {
1051+
ErtsSysForkerProto *proto =
1052+
erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA, sizeof(ErtsSysForkerProto));
1053+
memset(proto, 0, sizeof(ErtsSysForkerProto));
1054+
proto->action = ErtsSysForkerProtoAction_Stop;
1055+
proto->u.stop.os_pid = dd->pid;
1056+
erl_drv_port_control(forker_port, ERTS_FORKER_DRV_CONTROL_MAGIC_NUMBER,
1057+
(char*)proto, sizeof(*proto));
1058+
}
1059+
10491060
erts_free(ERTS_ALC_T_DRV_TAB, dd);
10501061
}
10511062

@@ -1814,29 +1825,35 @@ static ErlDrvSSizeT forker_control(ErlDrvData e, unsigned int cmd, char *buf,
18141825
first_call = 0;
18151826
}
18161827

1817-
driver_enq(port_num, buf, len);
1818-
if (driver_sizeq(port_num) > sizeof(*proto)) {
1819-
return 0;
1820-
}
1821-
1822-
if ((res = sys_uds_write(forker_fd, (char*)proto, sizeof(*proto),
1823-
proto->u.start.fds, 3, 0)) < 0) {
1824-
if (errno == ERRNO_BLOCK || errno == EINTR) {
1825-
driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
1826-
return 0;
1827-
} else if (errno == EMFILE) {
1828-
forker_sigchld(proto->u.start.port_id, errno);
1829-
forker_deq(port_num, proto);
1828+
if (proto->action == ErtsSysForkerProtoAction_Start) {
1829+
driver_enq(port_num, buf, len);
1830+
if (driver_sizeq(port_num) > sizeof(*proto)) {
18301831
return 0;
1831-
} else {
1832-
erts_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno);
18331832
}
1834-
}
1833+
1834+
if ((res = sys_uds_write(forker_fd, (char*)proto, sizeof(*proto),
1835+
proto->u.start.fds, 3, 0)) < 0) {
1836+
if (errno == ERRNO_BLOCK || errno == EINTR) {
1837+
driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1);
1838+
return 0;
1839+
} else if (errno == EMFILE) {
1840+
forker_sigchld(proto->u.start.port_id, errno);
1841+
forker_deq(port_num, proto);
1842+
return 0;
1843+
} else {
1844+
erts_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno);
1845+
}
1846+
}
18351847

18361848
#ifndef FORKER_PROTO_START_ACK
1837-
ASSERT(res == sizeof(*proto));
1838-
forker_deq(port_num, proto);
1849+
ASSERT(res == sizeof(*proto));
1850+
forker_deq(port_num, proto);
18391851
#endif
1852+
} else if (proto->action == ErtsSysForkerProtoAction_Stop) {
1853+
if ((res = write(forker_fd, (char*)proto, sizeof(*proto))) < 0) {
1854+
erts_exit(ERTS_DUMP_EXIT, "Failed to write stop to erl_child_setup: %d\n", errno);
1855+
}
1856+
}
18401857

18411858
return 0;
18421859
}

erts/preloaded/src/erlang.erl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7475,6 +7475,11 @@ reported to the owning process using signals of the form
74757475

74767476
The maximum number of ports that can be open at the same time can be configured
74777477
by passing command-line flag [`+Q`](erl_cmd.md#max_ports) to [erl](erl_cmd.md).
7478+
7479+
When a port is closed or the VM shuts down, spawned executables are sent a
7480+
`SIGTERM` on Unix. The child may still outlive the VM if it traps the signal.
7481+
Note that processes started by a shell under `spawn` are not signalled and will not terminate
7482+
unless they respond to stdin or stdout being closed.
74787483
""".
74797484
-doc #{ category => ports }.
74807485
-spec open_port(PortName, PortSettings) -> port() when

0 commit comments

Comments
 (0)