Skip to content

Commit 6825379

Browse files
committed
Fix risk of a partial write txn being applied
Signed-off-by: Shyam Jeedigunta <[email protected]>
1 parent 3d6ff97 commit 6825379

File tree

2 files changed

+34
-7
lines changed

2 files changed

+34
-7
lines changed

server/etcdserver/txn/txn.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,11 @@ func txn(ctx context.Context, lg *zap.Logger, txnWrite mvcc.TxnWrite, rt *pb.Txn
307307
_, err := executeTxn(ctx, lg, txnWrite, rt, txnPath, txnResp)
308308
if err != nil {
309309
if isWrite {
310-
// end txn to release locks before panic
311-
txnWrite.End()
312-
// When txn with write operations starts it has to be successful
313-
// We don't have a way to recover state in case of write failure
310+
// CAUTION: When a txn performing write operations starts, we always expect it to be successful.
311+
// If a write failure is seen we SHOULD NOT try to recover the server, but crash with a panic to make the failure explicit.
312+
// Trying to silently recover (e.g by ignoring the failed txn or calling txn.End() early) poses serious risks:
313+
// - violation of transaction atomicity if some write operations have been partially executed
314+
// - data inconsistency across different etcd members if they applied the txn asymmetrically
314315
lg.Panic("unexpected error during txn with writes", zap.Error(err))
315316
} else {
316317
lg.Error("unexpected error during readonly txn", zap.Error(err))

server/etcdserver/txn/txn_test.go

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ package txn
1616

1717
import (
1818
"context"
19+
"crypto/sha256"
20+
"io"
21+
"os"
1922
"strings"
2023
"testing"
2124
"time"
@@ -336,9 +339,8 @@ func TestReadonlyTxnError(t *testing.T) {
336339
}
337340
}
338341

339-
func TestWriteTxnPanic(t *testing.T) {
340-
b, _ := betesting.NewDefaultTmpBackend(t)
341-
defer betesting.Close(t, b)
342+
func TestWriteTxnPanicWithoutApply(t *testing.T) {
343+
b, bePath := betesting.NewDefaultTmpBackend(t)
342344
s := mvcc.NewStore(zaptest.NewLogger(t), b, &lease.FakeLessor{}, mvcc.StoreConfig{})
343345
defer s.Close()
344346

@@ -367,7 +369,17 @@ func TestWriteTxnPanic(t *testing.T) {
367369
},
368370
}
369371

372+
// compute DB file hash before applying the txn
373+
dbHashBefore, err := computeFileHash(bePath)
374+
require.NoErrorf(t, err, "failed to compute DB file hash before txn")
375+
376+
// we verify the following properties below:
377+
// 1. server panics after a write txn aply fails (invariant: server should never try to move on from a failed write)
378+
// 2. no writes from the txn are applied to the backend (invariant: failed write should have no side-effect on DB state besides panic)
370379
assert.Panics(t, func() { Txn(ctx, zaptest.NewLogger(t), txn, false, s, &lease.FakeLessor{}) }, "Expected panic in Txn with writes")
380+
dbHashAfter, err := computeFileHash(bePath)
381+
require.NoErrorf(t, err, "failed to compute DB file hash after txn")
382+
require.Equalf(t, dbHashBefore, dbHashAfter, "mismatch in DB hash before and after failed write txn")
371383
}
372384

373385
func TestCheckTxnAuth(t *testing.T) {
@@ -566,6 +578,20 @@ func setupAuth(t *testing.T, be backend.Backend) auth.AuthStore {
566578
return as
567579
}
568580

581+
func computeFileHash(filePath string) (string, error) {
582+
file, err := os.Open(filePath)
583+
if err != nil {
584+
return "", err
585+
}
586+
defer file.Close()
587+
588+
h := sha256.New()
589+
if _, err := io.Copy(h, file); err != nil {
590+
return "", err
591+
}
592+
return string(h.Sum(nil)), nil
593+
}
594+
569595
// CheckTxnAuth variables setup.
570596
var (
571597
inRangeCompare = &pb.Compare{

0 commit comments

Comments
 (0)