Skip to content

Commit

Permalink
Fix race condition on ssh client connection startup
Browse files Browse the repository at this point in the history
There were 2 race conditions that can happen on the startup of
a SSH client connection:
- two process race to create the ssh_system_sup for a given
address, which can cause the start of the ssh_system_sup to return
{error, {already_started, Pid}};
- one process race to create the ssh_system_sup when it's being
brought down due to all significant processes have exited.

This commit fixes these race conditions and additionally fixes a third issue
that would happen when two processes race to create the ssh_system_sup, an
ssh_acceptor_sup would be created for the address the client is trying to
connect.
  • Loading branch information
alexandrejbr authored and Alexandre Rodrigues committed Aug 10, 2023
1 parent ae1175f commit b5969be
Showing 1 changed file with 31 additions and 12 deletions.
43 changes: 31 additions & 12 deletions lib/ssh/src/ssh_system_sup.erl
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,26 @@

start_system(Role, Address0, Options) ->
case find_system_sup(Role, Address0) of
{ok,{SysPid,Address}} ->
{ok,{SysPid,Address}} when Role =:= server ->
restart_acceptor(SysPid, Address, Options);
{ok,{SysPid,_Address}} ->
{ok, SysPid};
{error,not_found} ->
supervisor:start_child(sup(Role),
case supervisor:start_child(sup(Role),
#{id => {?MODULE,Address0},
start => {?MODULE, start_link, [Role, Address0, Options]},
restart => temporary,
type => supervisor
})
of
{ok, SysPid} ->
{ok, SysPid};
{error, {already_started, SysPid}} ->
%% There was other connection that created the supervisor while
%% this process was trying to create it as well
{ok, SysPid};
Others -> Others
end
end.

%%%----------------------------------------------------------------
Expand Down Expand Up @@ -100,16 +111,7 @@ start_subsystem(Role, Address=#address{}, Socket, Options0) ->
Id = make_ref(),
case get_system_sup(Role, Address, Options) of
{ok,SysPid} ->
case supervisor:start_child(SysPid,
#{id => Id,
start => {ssh_subsystem_sup, start_link,
[Role,Address,Id,Socket,Options]
},
restart => temporary,
significant => true,
type => supervisor
})
of
case start_subsystem_sup(SysPid, Id, [Role,Address,Id,Socket,Options]) of
{ok,_SubSysPid} ->
try
receive
Expand All @@ -129,13 +131,30 @@ start_subsystem(Role, Address=#address{}, Socket, Options0) ->
supervisor:terminate_child(SysPid, Id),
{error, connection_start_timeout}
end;
{error,noproc} ->
%% The supervisor has shut itself down due to the termination of
%% all the significant children
start_subsystem(Role, Address, Socket, Options0);
Others ->
Others
end;
Others ->
Others
end.

start_subsystem_sup(SysPid, Id, StartArgs) ->
try
supervisor:start_child(SysPid,
#{id => Id,
start => {ssh_subsystem_sup, start_link, StartArgs},
restart => temporary,
significant => true,
type => supervisor
})
catch
exit:{noproc,_} ->
{error, noproc}
end.

%%%----------------------------------------------------------------
start_link(Role, Address, Options) ->
Expand Down

0 comments on commit b5969be

Please sign in to comment.