Skip to content

Commit 67e56a9

Browse files
authored
For KeyIsLocked error reported for timeout, if a lock is recently updated, don't try to resolve it. (#758)
* update client-go; format

Signed-off-by: ekexium <[email protected]>

* feat: do not resolve lock if duration_to_last_updated is short

Signed-off-by: ekexium <[email protected]>

* adjust the threshold to 1200ms to allow small deviation

Signed-off-by: ekexium <[email protected]>

* fix: don't treat it as WriteConflict, simply retry

Signed-off-by: ekexium <[email protected]>

* update kvproto

Signed-off-by: ekexium <[email protected]>

* set the threshold to 300ms

Signed-off-by: ekexium <[email protected]>

---------

Signed-off-by: ekexium <[email protected]>
1 parent f3e8703 commit 67e56a9

File tree

1 file changed

+110
-47
lines changed

1 file changed

+110
-47
lines changed

Diff for: txnkv/transaction/pessimistic.go

+110-47
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,9 @@ type diagnosticContext struct {
100100
reqDuration time.Duration
101101
}
102102

103-
func (action actionPessimisticLock) handleSingleBatch(c *twoPhaseCommitter, bo *retry.Backoffer, batch batchMutations) error {
103+
func (action actionPessimisticLock) handleSingleBatch(
104+
c *twoPhaseCommitter, bo *retry.Backoffer, batch batchMutations,
105+
) error {
104106
convertMutationsToPb := func(committerMutations CommitterMutations) []*kvrpcpb.Mutation {
105107
mutations := make([]*kvrpcpb.Mutation, committerMutations.Len())
106108
c.txn.GetMemBuffer().RLock()
@@ -120,26 +122,28 @@ func (action actionPessimisticLock) handleSingleBatch(c *twoPhaseCommitter, bo *
120122

121123
m := batch.mutations
122124
mutations := convertMutationsToPb(m)
123-
req := tikvrpc.NewRequest(tikvrpc.CmdPessimisticLock, &kvrpcpb.PessimisticLockRequest{
124-
Mutations: mutations,
125-
PrimaryLock: c.primary(),
126-
StartVersion: c.startTS,
127-
ForUpdateTs: c.forUpdateTS,
128-
IsFirstLock: c.isFirstLock,
129-
WaitTimeout: action.LockWaitTime(),
130-
ReturnValues: action.ReturnValues,
131-
CheckExistence: action.CheckExistence,
132-
MinCommitTs: c.forUpdateTS + 1,
133-
WakeUpMode: action.wakeUpMode,
134-
LockOnlyIfExists: action.LockOnlyIfExists,
135-
}, kvrpcpb.Context{
136-
Priority: c.priority,
137-
SyncLog: c.syncLog,
138-
ResourceGroupTag: action.LockCtx.ResourceGroupTag,
139-
MaxExecutionDurationMs: uint64(client.MaxWriteExecutionTime.Milliseconds()),
140-
RequestSource: c.txn.GetRequestSource(),
141-
ResourceGroupName: c.resourceGroupName,
142-
})
125+
req := tikvrpc.NewRequest(
126+
tikvrpc.CmdPessimisticLock, &kvrpcpb.PessimisticLockRequest{
127+
Mutations: mutations,
128+
PrimaryLock: c.primary(),
129+
StartVersion: c.startTS,
130+
ForUpdateTs: c.forUpdateTS,
131+
IsFirstLock: c.isFirstLock,
132+
WaitTimeout: action.LockWaitTime(),
133+
ReturnValues: action.ReturnValues,
134+
CheckExistence: action.CheckExistence,
135+
MinCommitTs: c.forUpdateTS + 1,
136+
WakeUpMode: action.wakeUpMode,
137+
LockOnlyIfExists: action.LockOnlyIfExists,
138+
}, kvrpcpb.Context{
139+
Priority: c.priority,
140+
SyncLog: c.syncLog,
141+
ResourceGroupTag: action.LockCtx.ResourceGroupTag,
142+
MaxExecutionDurationMs: uint64(client.MaxWriteExecutionTime.Milliseconds()),
143+
RequestSource: c.txn.GetRequestSource(),
144+
ResourceGroupName: c.resourceGroupName,
145+
},
146+
)
143147
if action.LockCtx.ResourceGroupTag == nil && action.LockCtx.ResourceGroupTagger != nil {
144148
req.ResourceGroupTag = action.LockCtx.ResourceGroupTagger(req.Req.(*kvrpcpb.PessimisticLockRequest))
145149
}
@@ -168,8 +172,10 @@ func (action actionPessimisticLock) handleSingleBatch(c *twoPhaseCommitter, bo *
168172
for _, m := range mutations {
169173
keys = append(keys, hex.EncodeToString(m.Key))
170174
}
171-
logutil.BgLogger().Info("[failpoint] injected lock ttl = 1 on pessimistic lock",
172-
zap.Uint64("txnStartTS", c.startTS), zap.Strings("keys", keys))
175+
logutil.BgLogger().Info(
176+
"[failpoint] injected lock ttl = 1 on pessimistic lock",
177+
zap.Uint64("txnStartTS", c.startTS), zap.Strings("keys", keys),
178+
)
173179
}
174180
req.PessimisticLock().LockTtl = ttl
175181
if _, err := util.EvalFailpoint("PessimisticLockErrWriteConflict"); err == nil {
@@ -221,7 +227,9 @@ func (action actionPessimisticLock) handleSingleBatch(c *twoPhaseCommitter, bo *
221227
}
222228
}
223229

224-
func (action actionPessimisticLock) handleRegionError(c *twoPhaseCommitter, bo *retry.Backoffer, batch *batchMutations, regionErr *errorpb.Error) (finished bool, err error) {
230+
func (action actionPessimisticLock) handleRegionError(
231+
c *twoPhaseCommitter, bo *retry.Backoffer, batch *batchMutations, regionErr *errorpb.Error,
232+
) (finished bool, err error) {
225233
// For other region error and the fake region error, backoff because
226234
// there's something wrong.
227235
// For the real EpochNotMatch error, don't backoff.
@@ -242,7 +250,13 @@ func (action actionPessimisticLock) handleRegionError(c *twoPhaseCommitter, bo *
242250
return true, err
243251
}
244252

245-
func (action actionPessimisticLock) handleKeyError(c *twoPhaseCommitter, keyErrs []*kvrpcpb.KeyError) (locks []*txnlock.Lock, finished bool, err error) {
253+
// When handling wait timeout, if the current lock is updated within the threshold, do not try to resolve lock
254+
// The default timeout in TiKV is 1 second. 300ms should be appropriate for common hot update workloads.
255+
const skipResolveThresholdMs = 300
256+
257+
func (action actionPessimisticLock) handleKeyErrorForResolve(
258+
c *twoPhaseCommitter, keyErrs []*kvrpcpb.KeyError,
259+
) (locks []*txnlock.Lock, finished bool, err error) {
246260
for _, keyErr := range keyErrs {
247261
// Check already exists error
248262
if alreadyExist := keyErr.GetAlreadyExist(); alreadyExist != nil {
@@ -253,17 +267,32 @@ func (action actionPessimisticLock) handleKeyError(c *twoPhaseCommitter, keyErrs
253267
return nil, true, errors.WithStack(&tikverr.ErrDeadlock{Deadlock: deadlock})
254268
}
255269

270+
// Do not resolve the lock if the lock was recently updated which indicates the txn holding the lock is
271+
// much likely alive.
272+
// This should only happen for wait timeout.
273+
if lockInfo := keyErr.GetLocked(); lockInfo != nil &&
274+
lockInfo.DurationToLastUpdateMs > 0 &&
275+
lockInfo.DurationToLastUpdateMs < skipResolveThresholdMs {
276+
continue
277+
}
278+
256279
// Extract lock from key error
257280
lock, err1 := txnlock.ExtractLockFromKeyErr(keyErr)
258281
if err1 != nil {
259282
return nil, true, err1
260283
}
261284
locks = append(locks, lock)
262285
}
286+
if len(locks) == 0 {
287+
return nil, false, nil
288+
}
263289
return locks, false, nil
264290
}
265291

266-
func (action actionPessimisticLock) handlePessimisticLockResponseNormalMode(c *twoPhaseCommitter, bo *retry.Backoffer, batch *batchMutations, mutationsPb []*kvrpcpb.Mutation, resp *tikvrpc.Response, diagCtx *diagnosticContext) (finished bool, err error) {
292+
func (action actionPessimisticLock) handlePessimisticLockResponseNormalMode(
293+
c *twoPhaseCommitter, bo *retry.Backoffer, batch *batchMutations, mutationsPb []*kvrpcpb.Mutation,
294+
resp *tikvrpc.Response, diagCtx *diagnosticContext,
295+
) (finished bool, err error) {
267296
regionErr, err := resp.GetRegionError()
268297
if err != nil {
269298
return true, err
@@ -283,7 +312,12 @@ func (action actionPessimisticLock) handlePessimisticLockResponseNormalMode(c *t
283312
if len(keyErrs) == 0 {
284313

285314
if action.LockCtx.Stats != nil {
286-
action.LockCtx.Stats.MergeReqDetails(diagCtx.reqDuration, batch.region.GetID(), diagCtx.sender.GetStoreAddr(), lockResp.ExecDetailsV2)
315+
action.LockCtx.Stats.MergeReqDetails(
316+
diagCtx.reqDuration,
317+
batch.region.GetID(),
318+
diagCtx.sender.GetStoreAddr(),
319+
lockResp.ExecDetailsV2,
320+
)
287321
}
288322

289323
if batch.isPrimary {
@@ -314,10 +348,14 @@ func (action actionPessimisticLock) handlePessimisticLockResponseNormalMode(c *t
314348
}
315349
return true, nil
316350
}
317-
locks, finished, err := action.handleKeyError(c, keyErrs)
351+
352+
locks, finished, err := action.handleKeyErrorForResolve(c, keyErrs)
318353
if err != nil {
319354
return finished, err
320355
}
356+
if len(locks) == 0 {
357+
return false, nil
358+
}
321359

322360
// Because we already waited on tikv, no need to Backoff here.
323361
// tikv default will wait 3s(also the maximum wait value) when lock error occurs
@@ -360,7 +398,10 @@ func (action actionPessimisticLock) handlePessimisticLockResponseNormalMode(c *t
360398
return false, nil
361399
}
362400

363-
func (action actionPessimisticLock) handlePessimisticLockResponseForceLockMode(c *twoPhaseCommitter, bo *retry.Backoffer, batch *batchMutations, mutationsPb []*kvrpcpb.Mutation, resp *tikvrpc.Response, diagCtx *diagnosticContext) (finished bool, err error) {
401+
func (action actionPessimisticLock) handlePessimisticLockResponseForceLockMode(
402+
c *twoPhaseCommitter, bo *retry.Backoffer, batch *batchMutations, mutationsPb []*kvrpcpb.Mutation,
403+
resp *tikvrpc.Response, diagCtx *diagnosticContext,
404+
) (finished bool, err error) {
364405
regionErr, err := resp.GetRegionError()
365406
if err != nil {
366407
return true, err
@@ -376,7 +417,9 @@ func (action actionPessimisticLock) handlePessimisticLockResponseForceLockMode(c
376417
if len(mutationsPb) > 1 || len(lockResp.Results) > 1 {
377418
panic("unreachable")
378419
}
379-
if batch.isPrimary && len(lockResp.Results) > 0 && lockResp.Results[0].Type != kvrpcpb.PessimisticLockKeyResultType_LockResultFailed {
420+
if batch.isPrimary &&
421+
len(lockResp.Results) > 0 &&
422+
lockResp.Results[0].Type != kvrpcpb.PessimisticLockKeyResultType_LockResultFailed {
380423
// After locking the primary key, we should protect the primary lock from expiring.
381424
c.run(c, action.LockCtx)
382425
}
@@ -422,11 +465,16 @@ func (action actionPessimisticLock) handlePessimisticLockResponseForceLockMode(c
422465

423466
if len(lockResp.Results) > 0 && !isMutationFailed {
424467
if action.LockCtx.Stats != nil {
425-
action.LockCtx.Stats.MergeReqDetails(diagCtx.reqDuration, batch.region.GetID(), diagCtx.sender.GetStoreAddr(), lockResp.ExecDetailsV2)
468+
action.LockCtx.Stats.MergeReqDetails(
469+
diagCtx.reqDuration,
470+
batch.region.GetID(),
471+
diagCtx.sender.GetStoreAddr(),
472+
lockResp.ExecDetailsV2,
473+
)
426474
}
427475
}
428476

429-
locks, finished, err := action.handleKeyError(c, keyErrs)
477+
locks, finished, err := action.handleKeyErrorForResolve(c, keyErrs)
430478
if err != nil {
431479
return finished, err
432480
}
@@ -477,9 +525,9 @@ func (action actionPessimisticLock) handlePessimisticLockResponseForceLockMode(c
477525
return false, nil
478526
}
479527

480-
// If the failedMutations is not empty and the error is not KeyIsLocked, the function should have already
481-
// returned before. So this is an unreachable path.
482-
return true, errors.New("Pessimistic lock response corrupted")
528+
// This can be the situation where KeyIsLocked errors are generated by timeout,
529+
// and we decide not to resolve them. Instead, just retry
530+
return false, nil
483531
}
484532

485533
if len(locks) != 0 {
@@ -497,16 +545,20 @@ func (action actionPessimisticLock) handlePessimisticLockResponseForceLockMode(c
497545
return true, nil
498546
}
499547

500-
func (actionPessimisticRollback) handleSingleBatch(c *twoPhaseCommitter, bo *retry.Backoffer, batch batchMutations) error {
548+
func (actionPessimisticRollback) handleSingleBatch(
549+
c *twoPhaseCommitter, bo *retry.Backoffer, batch batchMutations,
550+
) error {
501551
forUpdateTS := c.forUpdateTS
502552
if c.maxLockedWithConflictTS > forUpdateTS {
503553
forUpdateTS = c.maxLockedWithConflictTS
504554
}
505-
req := tikvrpc.NewRequest(tikvrpc.CmdPessimisticRollback, &kvrpcpb.PessimisticRollbackRequest{
506-
StartVersion: c.startTS,
507-
ForUpdateTs: forUpdateTS,
508-
Keys: batch.mutations.GetKeys(),
509-
})
555+
req := tikvrpc.NewRequest(
556+
tikvrpc.CmdPessimisticRollback, &kvrpcpb.PessimisticRollbackRequest{
557+
StartVersion: c.startTS,
558+
ForUpdateTs: forUpdateTS,
559+
Keys: batch.mutations.GetKeys(),
560+
},
561+
)
510562
req.RequestSource = util.RequestSourceFromCtx(bo.GetCtx())
511563
req.MaxExecutionDurationMs = uint64(client.MaxWriteExecutionTime.Milliseconds())
512564
resp, err := c.store.SendReq(bo, req, batch.region, client.ReadTimeoutShort)
@@ -528,7 +580,10 @@ func (actionPessimisticRollback) handleSingleBatch(c *twoPhaseCommitter, bo *ret
528580
return nil
529581
}
530582

531-
func (c *twoPhaseCommitter) pessimisticLockMutations(bo *retry.Backoffer, lockCtx *kv.LockCtx, lockWaitMode kvrpcpb.PessimisticLockWakeUpMode, mutations CommitterMutations) error {
583+
func (c *twoPhaseCommitter) pessimisticLockMutations(
584+
bo *retry.Backoffer, lockCtx *kv.LockCtx, lockWaitMode kvrpcpb.PessimisticLockWakeUpMode,
585+
mutations CommitterMutations,
586+
) error {
532587
if c.sessionID > 0 {
533588
if val, err := util.EvalFailpoint("beforePessimisticLock"); err == nil {
534589
// Pass multiple instructions in one string, delimited by commas, to trigger multiple behaviors, like
@@ -537,19 +592,27 @@ func (c *twoPhaseCommitter) pessimisticLockMutations(bo *retry.Backoffer, lockCt
537592
for _, action := range strings.Split(v, ",") {
538593
if action == "delay" {
539594
duration := time.Duration(rand.Int63n(int64(time.Second) * 5))
540-
logutil.Logger(bo.GetCtx()).Info("[failpoint] injected delay at pessimistic lock",
541-
zap.Uint64("txnStartTS", c.startTS), zap.Duration("duration", duration))
595+
logutil.Logger(bo.GetCtx()).Info(
596+
"[failpoint] injected delay at pessimistic lock",
597+
zap.Uint64("txnStartTS", c.startTS), zap.Duration("duration", duration),
598+
)
542599
time.Sleep(duration)
543600
} else if action == "fail" {
544-
logutil.Logger(bo.GetCtx()).Info("[failpoint] injected failure at pessimistic lock",
545-
zap.Uint64("txnStartTS", c.startTS))
601+
logutil.Logger(bo.GetCtx()).Info(
602+
"[failpoint] injected failure at pessimistic lock",
603+
zap.Uint64("txnStartTS", c.startTS),
604+
)
546605
return errors.New("injected failure at pessimistic lock")
547606
}
548607
}
549608
}
550609
}
551610
}
552-
return c.doActionOnMutations(bo, actionPessimisticLock{LockCtx: lockCtx, wakeUpMode: lockWaitMode, isInternal: c.txn.isInternal()}, mutations)
611+
return c.doActionOnMutations(
612+
bo,
613+
actionPessimisticLock{LockCtx: lockCtx, wakeUpMode: lockWaitMode, isInternal: c.txn.isInternal()},
614+
mutations,
615+
)
553616
}
554617

555618
func (c *twoPhaseCommitter) pessimisticRollbackMutations(bo *retry.Backoffer, mutations CommitterMutations) error {

0 commit comments

Comments
 (0)