Skip to content

Commit 23f887a

Browse files
authored
Merge pull request #18799 from shyamjvs/backport-3.5
[3.5] manual: Fix risk of a partial write txn being applied
2 parents 5a5408b + dea4eb3 commit 23f887a

File tree

2 files changed

+34
-6
lines changed

2 files changed

+34
-6
lines changed

server/etcdserver/apply.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -478,10 +478,11 @@ func (a *applierV3backend) Txn(ctx context.Context, rt *pb.TxnRequest) (*pb.TxnR
478478
_, err := a.applyTxn(ctx, txn, rt, txnPath, txnResp)
479479
if err != nil {
480480
if isWrite {
481-
// end txn to release locks before panic
482-
txn.End()
483-
// When txn with write operations starts it has to be successful
484-
// We don't have a way to recover state in case of write failure
481+
// CAUTION: When a txn performing write operations starts, we always expect it to be successful.
482+
// If a write failure is seen we SHOULD NOT try to recover the server, but crash with a panic to make the failure explicit.
483+
// Trying to silently recover (e.g by ignoring the failed txn or calling txn.End() early) poses serious risks:
484+
// - violation of transaction atomicity if some write operations have been partially executed
485+
// - data inconsistency across different etcd members if they applied the txn asymmetrically
485486
lg.Panic("unexpected error during txn with writes", zap.Error(err))
486487
} else {
487488
lg.Error("unexpected error during readonly txn", zap.Error(err))

server/etcdserver/apply_test.go

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ package etcdserver
22

33
import (
44
"context"
5+
"crypto/sha256"
6+
"github.com/stretchr/testify/require"
7+
"io"
8+
"os"
59
"strings"
610
"sync"
711
"testing"
@@ -55,8 +59,7 @@ func TestReadonlyTxnError(t *testing.T) {
5559
}
5660

5761
func TestWriteTxnPanic(t *testing.T) {
58-
b, _ := betesting.NewDefaultTmpBackend(t)
59-
defer betesting.Close(t, b)
62+
b, bePath := betesting.NewDefaultTmpBackend(t)
6063
s := mvcc.New(zap.NewExample(), b, &lease.FakeLessor{}, mvcc.StoreConfig{})
6164
defer s.Close()
6265

@@ -92,5 +95,29 @@ func TestWriteTxnPanic(t *testing.T) {
9295
},
9396
}
9497

98+
// compute DB file hash before applying the txn
99+
dbHashBefore, err := computeFileHash(bePath)
100+
require.NoErrorf(t, err, "failed to compute DB file hash before txn")
101+
102+
// we verify the following properties below:
103+
// 1. server panics after a write txn aply fails (invariant: server should never try to move on from a failed write)
104+
// 2. no writes from the txn are applied to the backend (invariant: failed write should have no side-effect on DB state besides panic)
95105
assert.Panics(t, func() { a.Txn(ctx, txn) }, "Expected panic in Txn with writes")
106+
dbHashAfter, err := computeFileHash(bePath)
107+
require.NoErrorf(t, err, "failed to compute DB file hash after txn")
108+
require.Equalf(t, dbHashBefore, dbHashAfter, "mismatch in DB hash before and after failed write txn")
109+
}
110+
111+
func computeFileHash(filePath string) (string, error) {
112+
file, err := os.Open(filePath)
113+
if err != nil {
114+
return "", err
115+
}
116+
defer file.Close()
117+
118+
h := sha256.New()
119+
if _, err := io.Copy(h, file); err != nil {
120+
return "", err
121+
}
122+
return string(h.Sum(nil)), nil
96123
}

0 commit comments

Comments
 (0)