Commit 6334df4

Improved failure handling
- fully remove multierr.Errors
- system test for testing connection errors
- bugfix issue 350: View.Get fails on reconnecting view
- bugfix: processor freeze on error (334)
1 parent 9ff2ce3 · commit 6334df4

17 files changed: +731, -209 lines
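
The change that threads through all of these files is the removal of goka's internal multierr.Errors collector in favor of github.com/hashicorp/go-multierror. A minimal sketch of the pattern the diff migrates to (the step function below is illustrative, not part of the commit):

package main

import (
    "fmt"

    "github.com/hashicorp/go-multierror"
)

// step stands in for any fallible operation.
func step(i int) error {
    if i%2 == 0 {
        return fmt.Errorf("step %d failed", i)
    }
    return nil
}

func main() {
    // A nil *multierror.Error is a valid empty collector:
    // multierror.Append accepts a nil first argument.
    var errs *multierror.Error
    for i := 0; i < 4; i++ {
        if err := step(i); err != nil {
            errs = multierror.Append(errs, err)
        }
    }

    // ErrorOrNil returns nil when nothing was collected, so a single
    // call replaces the old HasErrors()/NilOrError() pair.
    if err := errs.ErrorOrNil(); err != nil {
        fmt.Println(err)
    }
}

Because Append and ErrorOrNil both tolerate a nil *multierror.Error, the collector needs no initialization; that is what lets cbContext declare its errors field as *multierror.Error and check ctx.errors.ErrorOrNil() != nil directly.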

Makefile

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+
+
+test:
+    go test -race ./...
+
+test-systemtest:
+    GOKA_SYSTEMTEST=y go test -v github.com/lovoo/goka/systemtest
+
+test-all: test test-systemtest

context.go

Lines changed: 5 additions & 5 deletions
@@ -8,7 +8,7 @@ import (
     "time"
 
     "github.com/Shopify/sarama"
-    "github.com/lovoo/goka/multierr"
+    "github.com/hashicorp/go-multierror"
 )
 
 type emitter func(topic string, key string, value []byte, headers Headers) *Promise
@@ -177,7 +177,7 @@ type cbContext struct {
         dones  int
         stores int
     }
-    errors multierr.Errors
+    errors *multierror.Error
     m      sync.Mutex
     wg     *sync.WaitGroup
 }
@@ -438,7 +438,7 @@ func (ctx *cbContext) start() {
 // this function must be called from a locked function.
 func (ctx *cbContext) tryCommit(err error) {
     if err != nil {
-        _ = ctx.errors.Collect(err)
+        ctx.errors = multierror.Append(ctx.errors, err)
     }
 
     // not all calls are done yet, do not send the ack upstream.
@@ -447,8 +447,8 @@ func (ctx *cbContext) tryCommit(err error) {
     }
 
     // commit if no errors, otherwise fail context
-    if ctx.errors.HasErrors() {
-        ctx.asyncFailer(ctx.errors.NilOrError())
+    if ctx.errors.ErrorOrNil() != nil {
+        ctx.asyncFailer(ctx.errors.ErrorOrNil())
     } else {
         ctx.commit()
     }

context_test.go

Lines changed: 2 additions & 1 deletion
@@ -121,7 +121,7 @@ func TestContext_DeferCommit_witherror(t *testing.T) {
     doneFunc(fmt.Errorf("async error"))
     // no commit, no ack, so we'll get the message again.
     test.AssertEqual(t, ack, 0)
-    test.AssertEqual(t, ctx.errors.NilOrError().Error(), "async error")
+    test.AssertStringContains(t, ctx.errors.ErrorOrNil().Error(), "async error")
 }
 
 func TestContext_Timestamp(t *testing.T) {
@@ -584,6 +584,7 @@ func TestContext_Lookup(t *testing.T) {
     msg: &message{key: key},
     views: map[string]*View{
         string(table): {
+            state: newViewSignal().SetState(State(ViewStateRunning)),
             opts: &voptions{
                 tableCodec: c,
                 hasher:     DefaultHasher(),

partition_table.go

Lines changed: 44 additions & 80 deletions
@@ -6,6 +6,7 @@ import (
     "time"
 
     "github.com/Shopify/sarama"
+    "github.com/hashicorp/go-multierror"
     "github.com/lovoo/goka/multierr"
     "github.com/lovoo/goka/storage"
 )
@@ -17,6 +18,8 @@ const (
 
     // internal offset we use to detect if the offset has never been stored locally
     offsetNotStored int64 = -3
+
+    consumerDrainTimeout = time.Second
 )
 
 // Backoff is used for adding backoff capabilities to the restarting
@@ -191,7 +194,6 @@ func (p *PartitionTable) Close() error {
 func (p *PartitionTable) createStorage(ctx context.Context) (*storageProxy, error) {
     var (
         err   error
-        errs  = new(multierr.Errors)
         st    storage.Storage
         start = time.Now()
         done  = make(chan struct{})
@@ -217,9 +219,7 @@ func (p *PartitionTable) createStorage(ctx context.Context) (*storageProxy, erro
     }
     err = st.Open()
     if err != nil {
-        errs.Collect(st.Close())
-        errs.Collect(fmt.Errorf("error opening storage: %v", err))
-        return nil, errs.NilOrError()
+        return nil, multierror.Append(st.Close(), fmt.Errorf("error opening storage: %v", err)).ErrorOrNil()
     }
 
     // close the db if context was cancelled before the builder returned
@@ -228,10 +228,9 @@ func (p *PartitionTable) createStorage(ctx context.Context) (*storageProxy, erro
         err = st.Close()
         // only collect context error if Close() errored out
         if err != nil {
-            errs.Collect(err)
-            errs.Collect(ctx.Err())
+            return nil, multierror.Append(err, ctx.Err()).ErrorOrNil()
         }
-        return nil, errs.NilOrError()
+        return nil, nil
     default:
     }
@@ -280,37 +279,25 @@ func (p *PartitionTable) load(ctx context.Context, stopAfterCatchup bool) (rerr
         storedOffset int64
         partConsumer sarama.PartitionConsumer
         err          error
-        errs         = new(multierr.Errors)
     )
     ctx, cancel := context.WithCancel(ctx)
     defer cancel()
 
-    // deferred error handling
-    defer func() {
-        errs.Collect(rerr)
-
-        rerr = errs.NilOrError()
-        return
-    }()
-
     p.state.SetState(State(PartitionConnecting))
 
     // fetch local offset
     storedOffset, err = p.st.GetOffset(offsetNotStored)
     if err != nil {
-        errs.Collect(fmt.Errorf("error reading local offset: %v", err))
-        return
+        return fmt.Errorf("error reading local offset: %v", err)
     }
 
     loadOffset, hwm, err := p.findOffsetToLoad(storedOffset)
     if err != nil {
-        errs.Collect(err)
-        return
+        return err
     }
 
     if storedOffset > 0 && hwm == 0 {
-        errs.Collect(fmt.Errorf("kafka tells us there's no message in the topic, but our cache has one. The table might be gone. Try to delete your local cache! Topic %s, partition %d, hwm %d, local offset %d", p.topic, p.partition, hwm, storedOffset))
-        return
+        return fmt.Errorf("kafka tells us there's no message in the topic, but our cache has one. The table might be gone. Try to delete your local cache! Topic %s, partition %d, hwm %d, local offset %d", p.topic, p.partition, hwm, storedOffset)
     }
 
     if storedOffset >= hwm {
@@ -334,8 +321,7 @@ func (p *PartitionTable) load(ctx context.Context, stopAfterCatchup bool) (rerr
     // AND we're here for catchup, so let's stop here
     // and do not attempt to load anything
     if stopAfterCatchup && loadOffset >= hwm {
-        errs.Collect(p.markRecovered(ctx))
-        return
+        return p.markRecovered(ctx)
     }
 
     if stopAfterCatchup {
@@ -348,17 +334,13 @@ func (p *PartitionTable) load(ctx context.Context, stopAfterCatchup bool) (rerr
 
     partConsumer, err = p.consumer.ConsumePartition(p.topic, p.partition, loadOffset)
     if err != nil {
-        errs.Collect(fmt.Errorf("Error creating partition consumer for topic %s, partition %d, offset %d: %v", p.topic, p.partition, storedOffset, err))
-        return
+        return fmt.Errorf("Error creating partition consumer for topic %s, partition %d, offset %d: %v", p.topic, p.partition, storedOffset, err)
     }
 
-    // consume errors asynchronously
-    go p.handleConsumerErrors(ctx, errs, partConsumer)
-
     // close the consumer
     defer func() {
         partConsumer.AsyncClose()
-        p.drainConsumer(partConsumer, errs)
+        rerr = multierror.Append(rerr, p.drainConsumer(partConsumer)).ErrorOrNil()
     }()
 
     if stopAfterCatchup {
@@ -371,15 +353,13 @@ func (p *PartitionTable) load(ctx context.Context, stopAfterCatchup bool) (rerr
     loadErr := p.loadMessages(ctx, partConsumer, hwm, stopAfterCatchup)
 
     if loadErr != nil {
-        errs.Collect(loadErr)
-        return
+        return loadErr
     }
 
     if stopAfterCatchup {
-        errs.Collect(p.markRecovered(ctx))
-
-        now := time.Now()
-        p.enqueueStatsUpdate(ctx, func() { p.stats.Recovery.RecoveryTime = now })
+        err := p.markRecovered(ctx)
+        p.enqueueStatsUpdate(ctx, func() { p.stats.Recovery.RecoveryTime = time.Now() })
+        return err
     }
     return
 }
@@ -424,43 +404,27 @@ func (p *PartitionTable) markRecovered(ctx context.Context) error {
     }
 }
 
-func (p *PartitionTable) handleConsumerErrors(ctx context.Context, errs *multierr.Errors, cons sarama.PartitionConsumer) {
-    for {
-        select {
-        case consError, ok := <-cons.Errors():
-            if !ok {
-                return
-            }
-            err := fmt.Errorf("Consumer error: %v", consError)
-            p.log.Printf("%v", err)
-            errs.Collect(err)
-            // if there's an error, close the consumer
-            cons.AsyncClose()
-        case <-ctx.Done():
-            return
-        }
-    }
-}
+func (p *PartitionTable) drainConsumer(cons sarama.PartitionConsumer) error {
 
-func (p *PartitionTable) drainConsumer(cons sarama.PartitionConsumer, errs *multierr.Errors) {
-
-    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+    timeoutCtx, cancel := context.WithTimeout(context.Background(), consumerDrainTimeout)
     defer cancel()
 
-    errg, ctx := multierr.NewErrGroup(ctx)
+    errg, _ := multierr.NewErrGroup(context.Background())
 
     // drain errors channel
     errg.Go(func() error {
+        var errs *multierror.Error
+
        for {
            select {
-            case <-ctx.Done():
+            case <-timeoutCtx.Done():
                p.log.Printf("draining errors channel timed out")
-                return nil
+                return errs
            case err, ok := <-cons.Errors():
                if !ok {
-                    return nil
+                    return errs
                }
-                errs.Collect(err)
+                errs = multierror.Append(errs, err)
            }
        }
    })
@@ -469,7 +433,7 @@ func (p *PartitionTable) drainConsumer(cons sarama.PartitionConsumer, errs *mult
    errg.Go(func() error {
        for {
            select {
-            case <-ctx.Done():
+            case <-timeoutCtx.Done():
                p.log.Printf("draining messages channel timed out")
                return nil
            case _, ok := <-cons.Messages():
@@ -480,30 +444,31 @@ func (p *PartitionTable) drainConsumer(cons sarama.PartitionConsumer, errs *mult
            }
        }
    })
 
-    errg.Wait()
+    return errg.Wait().ErrorOrNil()
 }
 
-func (p *PartitionTable) loadMessages(ctx context.Context, cons sarama.PartitionConsumer, partitionHwm int64, stopAfterCatchup bool) (rerr error) {
-    errs := new(multierr.Errors)
-
-    // deferred error handling
-    defer func() {
-        errs.Collect(rerr)
-
-        rerr = errs.NilOrError()
-        return
-    }()
+func (p *PartitionTable) loadMessages(ctx context.Context, cons sarama.PartitionConsumer, partitionHwm int64, stopAfterCatchup bool) error {
 
    stallTicker := time.NewTicker(p.stallPeriod)
    defer stallTicker.Stop()
 
    lastMessage := time.Now()
 
+    messages := cons.Messages()
+    errors := cons.Errors()
+
    for {
        select {
-        case msg, ok := <-cons.Messages():
+        case err, ok := <-errors:
            if !ok {
-                return
+                return nil
+            }
+            if err != nil {
+                return err
+            }
+        case msg, ok := <-messages:
+            if !ok {
+                return nil
            }
 
            // This case is for the Tester to achieve synchronity.
@@ -521,8 +486,7 @@ func (p *PartitionTable) loadMessages(ctx context.Context, cons sarama.Partition
 
            lastMessage = time.Now()
            if err := p.storeEvent(string(msg.Key), msg.Value, msg.Offset, msg.Headers); err != nil {
-                errs.Collect(fmt.Errorf("load: error updating storage: %v", err))
-                return
+                return fmt.Errorf("load: error updating storage: %v", err)
            }
 
            if stopAfterCatchup {
@@ -532,7 +496,7 @@ func (p *PartitionTable) loadMessages(ctx context.Context, cons sarama.Partition
            p.enqueueStatsUpdate(ctx, func() { p.trackIncomingMessageStats(msg) })
 
            if stopAfterCatchup && msg.Offset >= partitionHwm-1 {
-                return
+                return nil
            }
 
        case now := <-stallTicker.C:
@@ -543,7 +507,7 @@ func (p *PartitionTable) loadMessages(ctx context.Context, cons sarama.Partition
            }
 
        case <-ctx.Done():
-            return
+            return nil
        }
    }
 }
@@ -711,7 +675,7 @@ func (p *PartitionTable) IteratorWithRange(start []byte, limit []byte) (storage.
 
 func (p *PartitionTable) readyToRead() error {
     pstate := p.CurrentState()
-    if pstate != PartitionRunning {
+    if pstate < PartitionConnecting {
         return fmt.Errorf("Partition is not running (but %v) so it's not safe to read values", pstate)
     }
     return nil

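The partition_table.go changes replace the asynchronous handleConsumerErrors goroutine, which shared a multierr.Errors collector with load(), with a synchronous drainConsumer invoked from load()'s deferred close path: after AsyncClose, both consumer channels are emptied concurrently under the one-second consumerDrainTimeout, so an unread channel can no longer block shutdown. A standalone sketch of that drain pattern, assuming golang.org/x/sync/errgroup in place of goka's internal multierr.NewErrGroup and plain channels in place of sarama's PartitionConsumer:

package main

import (
    "context"
    "time"

    "golang.org/x/sync/errgroup"
)

const consumerDrainTimeout = time.Second

// drain empties both channels until they close or the deadline fires, so the
// side that closes them can never block on an unread value.
func drain(messages <-chan string, errors <-chan error) error {
    timeoutCtx, cancel := context.WithTimeout(context.Background(), consumerDrainTimeout)
    defer cancel()

    var g errgroup.Group

    // Drain the errors channel, keeping the first error seen (the real code
    // aggregates all of them with multierror.Append).
    g.Go(func() error {
        var drained error
        for {
            select {
            case <-timeoutCtx.Done():
                return drained
            case err, ok := <-errors:
                if !ok {
                    return drained
                }
                if drained == nil {
                    drained = err
                }
            }
        }
    })

    // Drain the messages channel, discarding values; on timeout the real
    // code just logs and gives up.
    g.Go(func() error {
        for {
            select {
            case <-timeoutCtx.Done():
                return nil
            case _, ok := <-messages:
                if !ok {
                    return nil
                }
            }
        }
    })

    return g.Wait()
}

func main() {
    msgs := make(chan string)
    errs := make(chan error)
    close(msgs)
    close(errs)
    _ = drain(msgs, errs) // returns nil: both channels were already closed
}

Draining in the deferred close path means load() surfaces consumer errors through its own return value rather than through a collector shared with a background goroutine, which is presumably what fixes the processor freeze referenced in the commit message.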