Commit a59b59c

bugfix stuck fetchstats on slow views/processors (#439)
1 parent 612fe9a commit a59b59c

6 files changed: +116, -225 lines

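In short, this commit drops the PartitionProcessor's channel-based stats plumbing (requestStats, responseStats, updateStats, and the runStatsLoop goroutine) and guards the stats struct with a sync.RWMutex instead: writers mutate it through updateStats() under the write lock, while fetchStats() takes a read lock, clones the struct, and returns immediately, so it can no longer get stuck waiting on a stats loop that is busy with a slow view or processor. Below is a minimal, self-contained sketch of that locking pattern; the types are simplified stand-ins, not goka's actual PartitionProcStats or PartitionProcessor.

package main

import (
    "fmt"
    "sync"
)

// procStats is a simplified stand-in for goka's PartitionProcStats.
type procStats struct {
    Count int64
    Bytes int64
}

// clone returns a copy so readers never hold a reference to the live struct.
func (s *procStats) clone() *procStats {
    c := *s
    return &c
}

// partitionProcessor is a stand-in holding only the stats-related fields.
type partitionProcessor struct {
    mStats sync.RWMutex // guards stats
    stats  *procStats
}

// updateStats applies a mutation under the write lock; there is no
// separate stats goroutine for writers to queue behind.
func (pp *partitionProcessor) updateStats(updater func(*procStats)) {
    pp.mStats.Lock()
    defer pp.mStats.Unlock()
    updater(pp.stats)
}

// fetchStats takes a read lock and returns a clone, so callers only ever
// wait for the short critical section of a single update.
func (pp *partitionProcessor) fetchStats() *procStats {
    pp.mStats.RLock()
    defer pp.mStats.RUnlock()
    return pp.stats.clone()
}

func main() {
    pp := &partitionProcessor{stats: &procStats{}}
    pp.updateStats(func(s *procStats) {
        s.Count++
        s.Bytes += 42
    })
    fmt.Printf("%+v\n", pp.fetchStats()) // &{Count:1 Bytes:42}
}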

context_test.go

Lines changed: 3 additions & 5 deletions
@@ -327,8 +327,7 @@ func TestContext_Set(t *testing.T) {
             st: &storageProxy{
                 Storage: st,
             },
-            stats:       newTableStats(),
-            updateStats: make(chan func(), 10),
+            stats: newTableStats(),
         }
     )
     st.EXPECT().Set(key, []byte(value)).Return(nil)
@@ -376,9 +375,8 @@ func TestContext_GetSetStateful(t *testing.T) {
             st: &storageProxy{
                 Storage: st,
             },
-            state:       newPartitionTableState().SetState(State(PartitionRunning)),
-            stats:       newTableStats(),
-            updateStats: make(chan func(), 10),
+            state: newPartitionTableState().SetState(State(PartitionRunning)),
+            stats: newTableStats(),
         }
     )

partition_processor.go

Lines changed: 54 additions & 111 deletions
@@ -80,11 +80,8 @@ type PartitionProcessor struct {
     consumer sarama.Consumer
     tmgr     TopicManager

-    stats           *PartitionProcStats
-    requestStats    chan bool
-    responseStats   chan *PartitionProcStats
-    updateStats     chan func()
-    cancelStatsLoop context.CancelFunc
+    mStats sync.RWMutex
+    stats  *PartitionProcStats

     commit   commitCallback
     producer Producer
@@ -137,38 +134,32 @@ func newPartitionProcessor(partition int32,

     log := logger.Prefix(fmt.Sprintf("PartitionProcessor (%d)", partition))

-    statsLoopCtx, cancel := context.WithCancel(context.Background())
-
     for _, v := range graph.visitors {
         visitCallbacks[v.(*visitor).name] = v.(*visitor).cb
     }

     partProc := &PartitionProcessor{
-        log:             log,
-        opts:            opts,
-        partition:       partition,
-        state:           NewSignal(PPStateIdle, PPStateRecovering, PPStateRunning, PPStateStopping, PPStateStopped).SetState(PPStateIdle),
-        callbacks:       callbacks,
-        lookups:         lookupTables,
-        consumer:        consumer,
-        producer:        producer,
-        tmgr:            tmgr,
-        joins:           make(map[string]*PartitionTable),
-        input:           make(chan *message, opts.partitionChannelSize),
-        inputTopics:     topicList,
-        visitInput:      make(chan *visit, defaultPPVisitChannelSize),
-        visitCallbacks:  visitCallbacks,
-        graph:           graph,
-        stats:           newPartitionProcStats(topicList, outputList),
-        requestStats:    make(chan bool),
-        responseStats:   make(chan *PartitionProcStats, 1),
-        updateStats:     make(chan func(), 10),
-        cancelStatsLoop: cancel,
-        commit:          commit,
-        runMode:         runMode,
-    }
-
-    go partProc.runStatsLoop(statsLoopCtx)
+        log:            log,
+        opts:           opts,
+        partition:      partition,
+        state:          NewSignal(PPStateIdle, PPStateRecovering, PPStateRunning, PPStateStopping, PPStateStopped).SetState(PPStateIdle),
+        callbacks:      callbacks,
+        lookups:        lookupTables,
+        consumer:       consumer,
+        producer:       producer,
+        tmgr:           tmgr,
+        joins:          make(map[string]*PartitionTable),
+        input:          make(chan *message, opts.partitionChannelSize),
+        inputTopics:    topicList,
+        visitInput:     make(chan *visit, defaultPPVisitChannelSize),
+        visitCallbacks: visitCallbacks,
+        graph:          graph,
+
+        stats: newPartitionProcStats(topicList, outputList),
+
+        commit:  commit,
+        runMode: runMode,
+    }

     if graph.GroupTable() != nil {
         partProc.table = newPartitionTable(graph.GroupTable().Topic(),
@@ -217,7 +208,6 @@ func (pp *PartitionProcessor) Start(setupCtx, ctx context.Context) error {
     defer pp.state.SetState(PPStateRunning)

     if pp.table != nil {
-        go pp.table.RunStatsLoop(runnerCtx)
         setupErrg.Go(func() error {
             pp.log.Debugf("catching up table")
             defer pp.log.Debugf("catching up table done")
@@ -238,7 +228,6 @@
         )
         pp.joins[join.Topic()] = table

-        go table.RunStatsLoop(runnerCtx)
         setupErrg.Go(func() error {
             return table.SetupAndRecover(setupCtx, false)
         })
@@ -316,9 +305,6 @@ func (pp *PartitionProcessor) Stop() error {
         pp.cancelRunnerGroup()
     }

-    // stop the stats updating/serving loop
-    pp.cancelStatsLoop()
-
     // wait for the runner to be done
     runningErrs := multierror.Append(pp.runnerGroup.Wait().ErrorOrNil())

@@ -413,6 +399,9 @@ func (pp *PartitionProcessor) run(ctx context.Context) (rerr error) {
         }
     }()

+    updateHwmStatsTicker := time.NewTicker(statsHwmUpdateInterval)
+    defer updateHwmStatsTicker.Stop()
+
     for {
         select {
         case ev, isOpen := <-pp.input:
@@ -425,7 +414,15 @@
                 return newErrProcessing(pp.partition, err)
             }

-            pp.enqueueStatsUpdate(ctx, func() { pp.updateStatsWithMessage(ev) })
+            pp.updateStats(func(stats *PartitionProcStats) {
+                ip := stats.Input[ev.topic]
+                ip.Bytes += len(ev.value)
+                ip.LastOffset = ev.offset
+                if !ev.timestamp.IsZero() {
+                    ip.Delay = time.Since(ev.timestamp)
+                }
+                ip.Count++
+            })
         case <-ctx.Done():
             pp.log.Debugf("exiting, context is cancelled")
             return
@@ -440,75 +437,42 @@
         case <-asyncErrs:
             pp.log.Debugf("Errors occurred asynchronously. Will exit partition processor")
             return
-        }
-    }
-}
-
-func (pp *PartitionProcessor) enqueueStatsUpdate(ctx context.Context, updater func()) {
-    select {
-    case pp.updateStats <- updater:
-    case <-ctx.Done():
-    default:
-        // going to default indicates the updateStats channel is not read, so so the stats
-        // loop is not actually running.
-        // We must not block here, so we'll skip the update
-    }
-}
-
-func (pp *PartitionProcessor) runStatsLoop(ctx context.Context) {
-    updateHwmStatsTicker := time.NewTicker(statsHwmUpdateInterval)
-    defer updateHwmStatsTicker.Stop()
-    for {
-        select {
-        case <-pp.requestStats:
-            stats := pp.collectStats(ctx)
-            select {
-            case pp.responseStats <- stats:
-            case <-ctx.Done():
-                pp.log.Debugf("exiting, context is cancelled")
-                return
-            }
-        case update := <-pp.updateStats:
-            update()
         case <-updateHwmStatsTicker.C:
-            pp.updateHwmStats()
-        case <-ctx.Done():
-            return
+            pp.updateStats(pp.updateHwmStats)
         }
     }
 }

-// updateStatsWithMessage updates the stats with a received message
-func (pp *PartitionProcessor) updateStatsWithMessage(ev *message) {
-    ip := pp.stats.Input[ev.topic]
-    ip.Bytes += len(ev.value)
-    ip.LastOffset = ev.offset
-    if !ev.timestamp.IsZero() {
-        ip.Delay = time.Since(ev.timestamp)
-    }
-    ip.Count++
+func (pp *PartitionProcessor) updateStats(updater func(stats *PartitionProcStats)) {
+    pp.mStats.Lock()
+    defer pp.mStats.Unlock()
+    updater(pp.stats)
 }

 // updateHwmStats updates the offset lag for all input topics based on the
 // highwatermarks obtained by the consumer.
-func (pp *PartitionProcessor) updateHwmStats() {
+func (pp *PartitionProcessor) updateHwmStats(stats *PartitionProcStats) {
     hwms := pp.consumer.HighWaterMarks()
-    for input, inputStats := range pp.stats.Input {
+    for input, inputStats := range stats.Input {
         hwm := hwms[input][pp.partition]
         if hwm != 0 && inputStats.LastOffset != 0 {
             inputStats.OffsetLag = hwm - inputStats.LastOffset
         }
     }
 }

-func (pp *PartitionProcessor) collectStats(ctx context.Context) *PartitionProcStats {
-    var (
-        stats = pp.stats.clone()
-        m     sync.Mutex
-    )
+func (pp *PartitionProcessor) fetchStats(ctx context.Context) *PartitionProcStats {
+    pp.mStats.RLock()
+    stats := pp.stats.clone()
+    pp.mStats.RUnlock()
+
+    // mutex for the local stats-clone so the
+    // error group below doesn't get a concurrent-map-access error
+    var m sync.Mutex

     errg, ctx := multierr.NewErrGroup(ctx)

+    // fetch join table stats
     for topic, join := range pp.joins {
         topic, join := topic, join
         errg.Go(func() error {
@@ -523,6 +487,7 @@ func (pp *PartitionProcessor) collectStats(ctx context.Context) *PartitionProcSt
         })
     }

+    // if we have processor state, get those stats
     if pp.table != nil {
         errg.Go(func() error {
             stats.TableStats = pp.table.fetchStats(ctx)
@@ -541,31 +506,9 @@
     return stats
 }

-func (pp *PartitionProcessor) fetchStats(ctx context.Context) *PartitionProcStats {
-    select {
-    case <-ctx.Done():
-        return nil
-    case <-time.After(fetchStatsTimeout):
-        pp.log.Printf("requesting stats timed out")
-        return nil
-    case pp.requestStats <- true:
-    }
-
-    // retrieve from response-channel
-    select {
-    case <-ctx.Done():
-        return nil
-    case <-time.After(fetchStatsTimeout):
-        pp.log.Printf("Fetching stats timed out")
-        return nil
-    case stats := <-pp.responseStats:
-        return stats
-    }
-}
-
 func (pp *PartitionProcessor) enqueueTrackOutputStats(ctx context.Context, topic string, size int) {
-    pp.enqueueStatsUpdate(ctx, func() {
-        pp.stats.trackOutput(topic, size)
+    pp.updateStats(func(stats *PartitionProcStats) {
+        stats.trackOutput(topic, size)
     })
 }
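For context on the stuck behavior named in the commit title: the removed fetchStats had to hand a request to the single runStatsLoop goroutine over the unbuffered requestStats channel and then wait on responseStats, so whenever that goroutine was busy collecting stats from a slow view or processor, callers blocked until fetchStatsTimeout and got nil. The following toy program illustrates that failure mode; it is not goka's actual code, and the channel name and timings are invented for the example.

package main

import (
    "fmt"
    "time"
)

func main() {
    // An unbuffered request channel served by a single loop goroutine,
    // mimicking the removed requestStats/runStatsLoop pair.
    requestStats := make(chan bool)

    // The "stats loop" is busy (think: collecting stats from a slow view),
    // so it will not read from requestStats for a while.
    go func() {
        time.Sleep(2 * time.Second) // simulated slow stats collection
        <-requestStats
    }()

    // The requester can only give up after a timeout, as the old
    // fetchStats did; until then it is stuck on the send.
    select {
    case requestStats <- true:
        fmt.Println("request delivered")
    case <-time.After(500 * time.Millisecond):
        fmt.Println("requesting stats timed out") // this branch wins here
    }
}

With the mutex-guarded stats introduced by this commit, a reader never waits on another goroutine's schedule, only on the brief critical section of a single update.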