@@ -208,6 +208,7 @@ func (s *sink) maybeBackoff() {
208
208
select {
209
209
case <- after .C :
210
210
case <- s .cl .ctx .Done ():
211
+ case <- s .anyCtx ().Done ():
211
212
}
212
213
}
213
214
@@ -247,6 +248,34 @@ func (s *sink) drain() {
247
248
}
248
249
}
249
250
251
+ // Returns the first context encountered ranging across all records.
252
+ // This does not use defers to make it clear at the return that all
253
+ // unlocks are called in proper order. Ideally, do not call this func
254
+ // due to lock intensity.
255
+ func (s * sink ) anyCtx () context.Context {
256
+ s .recBufsMu .Lock ()
257
+ for _ , recBuf := range s .recBufs {
258
+ recBuf .mu .Lock ()
259
+ if len (recBuf .batches ) > 0 {
260
+ batch0 := recBuf .batches [0 ]
261
+ batch0 .mu .Lock ()
262
+ if batch0 .canFailFromLoadErrs && len (batch0 .records ) > 0 {
263
+ r0 := batch0 .records [0 ]
264
+ if rctx := r0 .cancelingCtx (); rctx != nil {
265
+ batch0 .mu .Unlock ()
266
+ recBuf .mu .Unlock ()
267
+ s .recBufsMu .Unlock ()
268
+ return rctx
269
+ }
270
+ }
271
+ batch0 .mu .Unlock ()
272
+ }
273
+ recBuf .mu .Unlock ()
274
+ }
275
+ s .recBufsMu .Unlock ()
276
+ return context .Background ()
277
+ }
278
+
250
279
func (s * sink ) produce (sem <- chan struct {}) bool {
251
280
var produced bool
252
281
defer func () {
@@ -267,6 +296,7 @@ func (s *sink) produce(sem <-chan struct{}) bool {
267
296
// - auth failure
268
297
// - transactional: a produce failure that failed the producer ID
269
298
// - AddPartitionsToTxn failure (see just below)
299
+ // - some head-of-line context failure
270
300
//
271
301
// All but the first error is fatal. Recovery may be possible with
272
302
// EndTransaction in specific cases, but regardless, all buffered
@@ -275,10 +305,71 @@ func (s *sink) produce(sem <-chan struct{}) bool {
275
305
// NOTE: we init the producer ID before creating a request to ensure we
276
306
// are always using the latest id/epoch with the proper sequence
277
307
// numbers. (i.e., resetAllSequenceNumbers && producerID logic combo).
278
- id , epoch , err := s .cl .producerID ()
308
+ //
309
+ // For the first-discovered-record-head-of-line context, we want to
310
+ // avoid looking it up if possible (which is why producerID takes a
311
+ // ctxFn). If we do use one, we want to be sure that the
312
+ // context.Canceled error is from *that* context rather than the client
313
+ // context or something else. So, we go through some special care to
314
+ // track setting the ctx / looking up if it is canceled.
315
+ var holCtxMu sync.Mutex
316
+ var holCtx context.Context
317
+ ctxFn := func () context.Context {
318
+ holCtxMu .Lock ()
319
+ defer holCtxMu .Unlock ()
320
+ holCtx = s .anyCtx ()
321
+ return holCtx
322
+ }
323
+ isHolCtxDone := func () bool {
324
+ holCtxMu .Lock ()
325
+ defer holCtxMu .Unlock ()
326
+ if holCtx == nil {
327
+ return false
328
+ }
329
+ select {
330
+ case <- holCtx .Done ():
331
+ return true
332
+ default :
333
+ }
334
+ return false
335
+ }
336
+
337
+ id , epoch , err := s .cl .producerID (ctxFn )
279
338
if err != nil {
339
+ var pe * errProducerIDLoadFail
280
340
switch {
281
- case errors .Is (err , errProducerIDLoadFail ):
341
+ case errors .As (err , & pe ):
342
+ if errors .Is (pe .err , context .Canceled ) && isHolCtxDone () {
343
+ // Some head-of-line record in a partition had a context cancelation.
344
+ // We look for any partition with HOL cancelations and fail them all.
345
+ s .cl .cfg .logger .Log (LogLevelInfo , "the first record in some partition(s) had a context cancelation; failing all relevant partitions" , "broker" , logID (s .nodeID ))
346
+ s .recBufsMu .Lock ()
347
+ defer s .recBufsMu .Unlock ()
348
+ for _ , recBuf := range s .recBufs {
349
+ recBuf .mu .Lock ()
350
+ var failAll bool
351
+ if len (recBuf .batches ) > 0 {
352
+ batch0 := recBuf .batches [0 ]
353
+ batch0 .mu .Lock ()
354
+ if batch0 .canFailFromLoadErrs && len (batch0 .records ) > 0 {
355
+ r0 := batch0 .records [0 ]
356
+ if rctx := r0 .cancelingCtx (); rctx != nil {
357
+ select {
358
+ case <- rctx .Done ():
359
+ failAll = true // we must not call failAllRecords here, because failAllRecords locks batches!
360
+ default :
361
+ }
362
+ }
363
+ }
364
+ batch0 .mu .Unlock ()
365
+ }
366
+ if failAll {
367
+ recBuf .failAllRecords (err )
368
+ }
369
+ recBuf .mu .Unlock ()
370
+ }
371
+ return true
372
+ }
282
373
s .cl .bumpRepeatedLoadErr (err )
283
374
s .cl .cfg .logger .Log (LogLevelWarn , "unable to load producer ID, bumping client's buffered record load errors by 1 and retrying" )
284
375
return true // whatever caused our produce, we did nothing, so keep going
@@ -385,6 +476,9 @@ func (s *sink) doSequenced(
385
476
promise : promise ,
386
477
}
387
478
479
+ // We can NOT use any record context. If we did, we would force the request
480
+ // to fail while also forcing the batch to be unfailable (due to no
481
+ // response).
388
482
br , err := s .cl .brokerOrErr (s .cl .ctx , s .nodeID , errUnknownBroker )
389
483
if err != nil {
390
484
wait .err = err
@@ -432,6 +526,11 @@ func (s *sink) doTxnReq(
432
526
req .batches .eachOwnerLocked (seqRecBatch .removeFromTxn )
433
527
}
434
528
}()
529
+ // We do NOT let record context cancelations fail this request: doing
530
+ // so would put the transactional ID in an unknown state. This is
531
+ // similar to the warning we give in the txn.go file, but the
532
+ // difference there is the user knows explicitly at the function call
533
+ // that canceling the context will opt them into invalid state.
435
534
err = s .cl .doWithConcurrentTransactions (s .cl .ctx , "AddPartitionsToTxn" , func () error {
436
535
stripped , err = s .issueTxnReq (req , txnReq )
437
536
return err
@@ -1393,6 +1492,16 @@ type promisedRec struct {
1393
1492
* Record
1394
1493
}
1395
1494
1495
+ func (pr promisedRec ) cancelingCtx () context.Context {
1496
+ if pr .ctx .Done () != nil {
1497
+ return pr .ctx
1498
+ }
1499
+ if pr .Context .Done () != nil {
1500
+ return pr .Context
1501
+ }
1502
+ return nil
1503
+ }
1504
+
1396
1505
// recBatch is the type used for buffering records before they are written.
1397
1506
type recBatch struct {
1398
1507
owner * recBuf // who owns us
@@ -1421,10 +1530,12 @@ type recBatch struct {
1421
1530
// Returns an error if the batch should fail.
1422
1531
func (b * recBatch ) maybeFailErr (cfg * cfg ) error {
1423
1532
if len (b .records ) > 0 {
1424
- ctx := b .records [0 ]. ctx
1533
+ r0 := & b .records [0 ]
1425
1534
select {
1426
- case <- ctx .Done ():
1427
- return ctx .Err ()
1535
+ case <- r0 .ctx .Done ():
1536
+ return r0 .ctx .Err ()
1537
+ case <- r0 .Context .Done ():
1538
+ return r0 .Context .Err ()
1428
1539
default :
1429
1540
}
1430
1541
}
0 commit comments