Skip to content

Commit 3f4eaae

Browse files
authored
Fix db client deadlocks with non-blocking cleanup and RW locks (#4918)
1 parent c2421b0 commit 3f4eaae

File tree

10 files changed

+211
-36
lines changed

10 files changed

+211
-36
lines changed

cmd/plugin_manager.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,15 @@ func doRunPluginManager(cmd *cobra.Command) error {
6060
log.Printf("[INFO] starting connection watcher")
6161
connectionWatcher, err := connection.NewConnectionWatcher(pluginManager)
6262
if err != nil {
63+
log.Printf("[ERROR] failed to create connection watcher: %v", err)
6364
return err
6465
}
66+
log.Printf("[INFO] connection watcher created successfully")
6567

6668
// close the connection watcher
6769
defer connectionWatcher.Close()
70+
} else {
71+
log.Printf("[WARN] connection watcher is DISABLED")
6872
}
6973

7074
log.Printf("[INFO] about to serve")

pkg/connection/connection_watcher.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,19 @@ func NewConnectionWatcher(pluginManager pluginManager) (*ConnectionWatcher, erro
2626
pluginManager: pluginManager,
2727
}
2828

29+
configDir := filepaths.EnsureConfigDir()
30+
log.Printf("[INFO] ConnectionWatcher will watch directory: %s for %s files", configDir, constants.ConfigExtension)
31+
2932
watcherOptions := &filewatcher.WatcherOptions{
30-
Directories: []string{filepaths.EnsureConfigDir()},
33+
Directories: []string{configDir},
3134
Include: filehelpers.InclusionsFromExtensions([]string{constants.ConfigExtension}),
3235
ListFlag: filehelpers.FilesRecursive,
3336
EventMask: fsnotify.Create | fsnotify.Remove | fsnotify.Rename | fsnotify.Write | fsnotify.Chmod,
3437
OnChange: func(events []fsnotify.Event) {
38+
log.Printf("[INFO] ConnectionWatcher detected %d file events", len(events))
39+
for _, event := range events {
40+
log.Printf("[INFO] ConnectionWatcher event: %s - %s", event.Op, event.Name)
41+
}
3542
w.handleFileWatcherEvent(events)
3643
},
3744
}
@@ -80,13 +87,17 @@ func (w *ConnectionWatcher) handleFileWatcherEvent([]fsnotify.Event) {
8087
// as these are both used by RefreshConnectionAndSearchPathsWithLocalClient
8188

8289
// set the global steampipe config
90+
log.Printf("[DEBUG] ConnectionWatcher: setting GlobalConfig")
8391
steampipeconfig.GlobalConfig = config
8492

8593
// call on changed callback - we must call this BEFORE calling refresh connections
8694
// convert config to format expected by plugin manager
8795
// (plugin manager cannot reference steampipe config to avoid circular deps)
96+
log.Printf("[DEBUG] ConnectionWatcher: creating connection config map")
8897
configMap := NewConnectionConfigMap(config.Connections)
98+
log.Printf("[DEBUG] ConnectionWatcher: calling OnConnectionConfigChanged with %d connections", len(configMap))
8999
w.pluginManager.OnConnectionConfigChanged(ctx, configMap, config.PluginsInstances)
100+
log.Printf("[DEBUG] ConnectionWatcher: OnConnectionConfigChanged complete")
90101

91102
// The only configurations from GlobalConfig which have
92103
// impact during Refresh are Database options and the Connections
@@ -99,7 +110,9 @@ func (w *ConnectionWatcher) handleFileWatcherEvent([]fsnotify.Event) {
99110
// Workspace Profile does not have any setting which can alter
100111
// behavior in service mode (namely search path). Therefore, it is safe
101112
// to use the GlobalConfig here and ignore Workspace Profile in general
113+
log.Printf("[DEBUG] ConnectionWatcher: calling SetDefaultsFromConfig")
102114
cmdconfig.SetDefaultsFromConfig(steampipeconfig.GlobalConfig.ConfigMap())
115+
log.Printf("[DEBUG] ConnectionWatcher: SetDefaultsFromConfig complete")
103116

104117
log.Printf("[INFO] calling RefreshConnections asyncronously")
105118

pkg/db/db_client/db_client.go

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,14 @@ type DbClient struct {
4949
sessions map[uint32]*db_common.DatabaseSession
5050

5151
// allows locked access to the 'sessions' map
52-
sessionsMutex *sync.Mutex
52+
sessionsMutex *sync.Mutex
53+
sessionsLockFlag atomic.Bool
5354

5455
// if a custom search path or a prefix is used, store it here
5556
customSearchPath []string
5657
searchPathPrefix []string
5758
// allows locked access to customSearchPath and searchPathPrefix
58-
searchPathMutex *sync.Mutex
59+
searchPathMutex *sync.RWMutex
5960
// the default user search path
6061
userSearchPath []string
6162
// disable timing - set whilst in process of querying the timing
@@ -73,7 +74,7 @@ func NewDbClient(ctx context.Context, connectionString string, opts ...ClientOpt
7374
parallelSessionInitLock: semaphore.NewWeighted(constants.MaxParallelClientInits),
7475
sessions: make(map[uint32]*db_common.DatabaseSession),
7576
sessionsMutex: &sync.Mutex{},
76-
searchPathMutex: &sync.Mutex{},
77+
searchPathMutex: &sync.RWMutex{},
7778
connectionString: connectionString,
7879
}
7980

@@ -152,6 +153,37 @@ func (c *DbClient) shouldFetchVerboseTiming() bool {
152153
(viper.GetString(pconstants.ArgOutput) == constants.OutputFormatJSON)
153154
}
154155

156+
// lockSessions acquires the sessionsMutex and tracks ownership for tryLock compatibility.
157+
func (c *DbClient) lockSessions() {
158+
if c.sessionsMutex == nil {
159+
return
160+
}
161+
c.sessionsLockFlag.Store(true)
162+
c.sessionsMutex.Lock()
163+
}
164+
165+
// sessionsTryLock attempts to acquire the sessionsMutex without blocking.
166+
// Returns false if the lock is already held.
167+
func (c *DbClient) sessionsTryLock() bool {
168+
if c.sessionsMutex == nil {
169+
return false
170+
}
171+
// best-effort: only one contender sets the flag and proceeds to lock
172+
if !c.sessionsLockFlag.CompareAndSwap(false, true) {
173+
return false
174+
}
175+
c.sessionsMutex.Lock()
176+
return true
177+
}
178+
179+
func (c *DbClient) sessionsUnlock() {
180+
if c.sessionsMutex == nil {
181+
return
182+
}
183+
c.sessionsMutex.Unlock()
184+
c.sessionsLockFlag.Store(false)
185+
}
186+
155187
// ServerSettings returns the settings of the steampipe service that this DbClient is connected to
156188
//
157189
// Keep in mind that when connecting to pre-0.21.x servers, the server_settings data is not available. This is expected.
@@ -173,9 +205,9 @@ func (c *DbClient) Close(context.Context) error {
173205
// nullify active sessions, since with the closing of the pools
174206
// none of the sessions will be valid anymore
175207
// Acquire mutex to prevent concurrent access to sessions map
176-
c.sessionsMutex.Lock()
208+
c.lockSessions()
177209
c.sessions = nil
178-
c.sessionsMutex.Unlock()
210+
c.sessionsUnlock()
179211

180212
return nil
181213
}

pkg/db/db_client/db_client_connect.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,14 @@ func (c *DbClient) establishConnectionPool(ctx context.Context, overrides client
6666
config.BeforeClose = func(conn *pgx.Conn) {
6767
if conn != nil && conn.PgConn() != nil {
6868
backendPid := conn.PgConn().PID()
69-
c.sessionsMutex.Lock()
70-
// Check if sessions map has been nil'd by Close()
71-
if c.sessions != nil {
72-
delete(c.sessions, backendPid)
69+
// Best-effort cleanup: do not block pool.Close() if sessions lock is busy.
70+
if c.sessionsTryLock() {
71+
// Check if sessions map has been nil'd by Close()
72+
if c.sessions != nil {
73+
delete(c.sessions, backendPid)
74+
}
75+
c.sessionsUnlock()
7376
}
74-
c.sessionsMutex.Unlock()
7577
}
7678
}
7779
// set an app name so that we can track database connections from this Steampipe execution

pkg/db/db_client/db_client_search_path.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ func (c *DbClient) loadUserSearchPath(ctx context.Context, connection *pgx.Conn)
7878

7979
// GetRequiredSessionSearchPath implements Client
8080
func (c *DbClient) GetRequiredSessionSearchPath() []string {
81-
c.searchPathMutex.Lock()
82-
defer c.searchPathMutex.Unlock()
81+
c.searchPathMutex.RLock()
82+
defer c.searchPathMutex.RUnlock()
8383

8484
if c.customSearchPath != nil {
8585
return c.customSearchPath
@@ -89,8 +89,8 @@ func (c *DbClient) GetRequiredSessionSearchPath() []string {
8989
}
9090

9191
func (c *DbClient) GetCustomSearchPath() []string {
92-
c.searchPathMutex.Lock()
93-
defer c.searchPathMutex.Unlock()
92+
c.searchPathMutex.RLock()
93+
defer c.searchPathMutex.RUnlock()
9494

9595
return c.customSearchPath
9696
}

pkg/db/db_client/db_client_session.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ func (c *DbClient) AcquireSession(ctx context.Context) (sessionResult *db_common
3737
}
3838
backendPid := databaseConnection.Conn().PgConn().PID()
3939

40-
c.sessionsMutex.Lock()
40+
c.lockSessions()
4141
// Check if client has been closed (sessions set to nil)
4242
if c.sessions == nil {
43-
c.sessionsMutex.Unlock()
43+
c.sessionsUnlock()
4444
sessionResult.Error = fmt.Errorf("client has been closed")
4545
return sessionResult
4646
}
@@ -52,7 +52,7 @@ func (c *DbClient) AcquireSession(ctx context.Context) (sessionResult *db_common
5252
// we get a new *sql.Conn everytime. USE IT!
5353
session.Connection = databaseConnection
5454
sessionResult.Session = session
55-
c.sessionsMutex.Unlock()
55+
c.sessionsUnlock()
5656

5757
// make sure that we close the acquired session, in case of error
5858
defer func() {

pkg/db/db_client/db_client_session_test.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@ package db_client
22

33
import (
44
"context"
5+
"os"
6+
"strings"
57
"sync"
68
"testing"
79

810
"github.com/stretchr/testify/assert"
11+
"github.com/stretchr/testify/require"
912
"github.com/turbot/steampipe/v2/pkg/db/db_common"
1013
)
1114

@@ -160,6 +163,28 @@ func TestDbClient_SearchPathUpdates(t *testing.T) {
160163
assert.Len(t, client.customSearchPath, 2, "Should have 2 schemas in search path")
161164
}
162165

166+
// TestSearchPathAccessShouldUseReadLocks checks that search path access does not block other goroutines unnecessarily.
167+
//
168+
// Holding an exclusive mutex during search-path reads in concurrent query setup can deadlock when
169+
// another goroutine is setting the path. The current code uses Lock/Unlock; this test documents
170+
// the expectation to move to a read/non-blocking lock so concurrent reads are safe.
171+
func TestSearchPathAccessShouldUseReadLocks(t *testing.T) {
172+
content, err := os.ReadFile("db_client_search_path.go")
173+
require.NoError(t, err, "should be able to read db_client_search_path.go")
174+
175+
source := string(content)
176+
177+
assert.Contains(t, source, "GetRequiredSessionSearchPath", "getter must exist")
178+
assert.Contains(t, source, "searchPathMutex", "getter must guard access to searchPath state")
179+
180+
// Expect a read or non-blocking lock in getters; fail if only full Lock/Unlock is present.
181+
hasRLock := strings.Contains(source, "RLock")
182+
hasTry := strings.Contains(source, "TryLock") || strings.Contains(source, "tryLock")
183+
if !hasRLock && !hasTry {
184+
t.Fatalf("GetRequiredSessionSearchPath should avoid exclusive Lock/Unlock to prevent deadlocks under concurrent query setup")
185+
}
186+
}
187+
163188
// TestDbClient_SessionConnectionNilSafety verifies handling of nil connections
164189
func TestDbClient_SessionConnectionNilSafety(t *testing.T) {
165190
session := db_common.NewDBSession(12345)
@@ -181,7 +206,7 @@ func TestDbClient_SessionSearchPathUpdatesThreadSafe(t *testing.T) {
181206
client := &DbClient{
182207
customSearchPath: []string{"public", "internal"},
183208
userSearchPath: []string{"public"},
184-
searchPathMutex: &sync.Mutex{},
209+
searchPathMutex: &sync.RWMutex{},
185210
}
186211

187212
// Number of concurrent operations to test

pkg/db/db_client/db_client_test.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,36 @@ func TestSessionMapCleanupImplemented(t *testing.T) {
5252
"Comment should document automatic cleanup mechanism")
5353
}
5454

55+
// TestBeforeCloseCleanupShouldBeNonBlocking ensures the cleanup hook does not take a blocking lock.
56+
//
57+
// A blocking mutex in the BeforeClose hook can deadlock pool.Close() when another goroutine
58+
// holds sessionsMutex (service stop/restart hangs). This test is intentionally strict and
59+
// will fail until the hook uses a non-blocking strategy (e.g., TryLock or similar).
60+
func TestBeforeCloseCleanupShouldBeNonBlocking(t *testing.T) {
61+
content, err := os.ReadFile("db_client_connect.go")
62+
require.NoError(t, err, "should be able to read db_client_connect.go")
63+
64+
source := string(content)
65+
66+
// Guardrail: the BeforeClose hook should avoid unconditionally blocking on sessionsMutex.
67+
assert.Contains(t, source, "config.BeforeClose", "BeforeClose cleanup hook must exist")
68+
assert.Contains(t, source, "sessionsTryLock", "BeforeClose cleanup should use non-blocking lock helper")
69+
70+
// Expect a non-blocking lock pattern; if we only find Lock()/Unlock, this fails.
71+
nonBlockingPatterns := []string{"TryLock", "tryLock", "non-block", "select {"}
72+
foundNonBlocking := false
73+
for _, p := range nonBlockingPatterns {
74+
if strings.Contains(source, p) {
75+
foundNonBlocking = true
76+
break
77+
}
78+
}
79+
80+
if !foundNonBlocking {
81+
t.Fatalf("BeforeClose cleanup appears to take a blocking lock on sessionsMutex; add a non-blocking guard to prevent pool.Close deadlocks")
82+
}
83+
}
84+
5585
// TestDbClient_Close_Idempotent verifies that calling Close() multiple times does not cause issues
5686
// Reference: Similar to bug #4712 (Result.Close() idempotency)
5787
//
@@ -284,13 +314,14 @@ func TestDbClient_SessionsMutexProtectsMap(t *testing.T) {
284314

285315
sourceCode := string(content)
286316

287-
// Count occurrences of mutex locks
288-
mutexLocks := strings.Count(sourceCode, "c.sessionsMutex.Lock()")
317+
// Count occurrences of mutex lock helpers
318+
mutexLocks := strings.Count(sourceCode, "lockSessions()") +
319+
strings.Count(sourceCode, "sessionsTryLock()")
289320

290321
// This is a heuristic check - in practice, we'd need more sophisticated analysis
291322
// But it serves as a reminder to use the mutex
292323
assert.True(t, mutexLocks > 0,
293-
"sessionsMutex.Lock() should be used when accessing sessions map")
324+
"sessions lock helpers should be used when accessing sessions map")
294325
}
295326

296327
// TestDbClient_SessionMapDocumentation verifies that session lifecycle is documented

pkg/pluginmanager_service/plugin_manager.go

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,29 @@ type PluginManager struct {
5151
// map of max cache size, keyed by plugin instance
5252
pluginCacheSizeMap map[string]int64
5353

54-
// map lock
54+
// mut protects concurrent access to plugin manager state (runningPluginMap, connectionConfigMap, etc.)
55+
//
56+
// LOCKING PATTERN TO PREVENT DEADLOCKS:
57+
// - Functions that acquire mut.Lock() and call other methods MUST only call *Internal versions
58+
// - Public methods that need locking: acquire lock → call internal version → release lock
59+
// - Internal methods: assume caller holds lock, never acquire lock themselves
60+
//
61+
// Example:
62+
// func (m *PluginManager) SomeMethod() {
63+
// m.mut.Lock()
64+
// defer m.mut.Unlock()
65+
// return m.someMethodInternal()
66+
// }
67+
// func (m *PluginManager) someMethodInternal() {
68+
// // NOTE: caller must hold m.mut lock
69+
// // ... implementation without locking ...
70+
// }
71+
//
72+
// Functions with internal/external versions:
73+
// - refreshRateLimiterTable / refreshRateLimiterTableInternal
74+
// - updateRateLimiterStatus / updateRateLimiterStatusInternal
75+
// - setRateLimiters / setRateLimitersInternal
76+
// - getPluginsWithChangedLimiters / getPluginsWithChangedLimitersInternal
5577
mut sync.RWMutex
5678

5779
// shutdown synchronization
@@ -231,23 +253,32 @@ func (m *PluginManager) doRefresh() {
231253

232254
// OnConnectionConfigChanged is the callback function invoked by the connection watcher when the config changed
233255
func (m *PluginManager) OnConnectionConfigChanged(ctx context.Context, configMap connection.ConnectionConfigMap, plugins map[string]*plugin.Plugin) {
256+
log.Printf("[DEBUG] OnConnectionConfigChanged: acquiring lock")
234257
m.mut.Lock()
235258
defer m.mut.Unlock()
259+
log.Printf("[DEBUG] OnConnectionConfigChanged: lock acquired")
236260

237261
log.Printf("[TRACE] OnConnectionConfigChanged: connections: %s plugin instances: %s", strings.Join(utils.SortedMapKeys(configMap), ","), strings.Join(utils.SortedMapKeys(plugins), ","))
238262

263+
log.Printf("[DEBUG] OnConnectionConfigChanged: calling handleConnectionConfigChanges")
239264
if err := m.handleConnectionConfigChanges(ctx, configMap); err != nil {
240265
log.Printf("[WARN] handleConnectionConfigChanges failed: %s", err.Error())
241266
}
267+
log.Printf("[DEBUG] OnConnectionConfigChanged: handleConnectionConfigChanges complete")
242268

243269
// update our plugin configs
270+
log.Printf("[DEBUG] OnConnectionConfigChanged: calling handlePluginInstanceChanges")
244271
if err := m.handlePluginInstanceChanges(ctx, plugins); err != nil {
245272
log.Printf("[WARN] handlePluginInstanceChanges failed: %s", err.Error())
246273
}
274+
log.Printf("[DEBUG] OnConnectionConfigChanged: handlePluginInstanceChanges complete")
247275

276+
log.Printf("[DEBUG] OnConnectionConfigChanged: calling handleUserLimiterChanges")
248277
if err := m.handleUserLimiterChanges(ctx, plugins); err != nil {
249278
log.Printf("[WARN] handleUserLimiterChanges failed: %s", err.Error())
250279
}
280+
log.Printf("[DEBUG] OnConnectionConfigChanged: handleUserLimiterChanges complete")
281+
log.Printf("[DEBUG] OnConnectionConfigChanged: about to release lock and return")
251282
}
252283

253284
func (m *PluginManager) GetConnectionConfig() connection.ConnectionConfigMap {
@@ -776,14 +807,19 @@ func (m *PluginManager) setCacheOptions(pluginClient *sdkgrpc.PluginClient) erro
776807
}
777808

778809
func (m *PluginManager) setRateLimiters(pluginInstance string, pluginClient *sdkgrpc.PluginClient) error {
810+
m.mut.RLock()
811+
defer m.mut.RUnlock()
812+
return m.setRateLimitersInternal(pluginInstance, pluginClient)
813+
}
814+
815+
func (m *PluginManager) setRateLimitersInternal(pluginInstance string, pluginClient *sdkgrpc.PluginClient) error {
816+
// NOTE: caller must hold m.mut lock (at least RLock)
779817
log.Printf("[INFO] setRateLimiters for plugin '%s'", pluginInstance)
780818
var defs []*sdkproto.RateLimiterDefinition
781819

782-
m.mut.RLock()
783820
for _, l := range m.userLimiters[pluginInstance] {
784821
defs = append(defs, RateLimiterAsProto(l))
785822
}
786-
m.mut.RUnlock()
787823

788824
req := &sdkproto.SetRateLimitersRequest{Definitions: defs}
789825

0 commit comments

Comments
 (0)