Skip to content

shardctrler.migrate() 的非原子性,以及 ChangeConfigTo 在并发配置变更(concurrent configuration changes)下的问题 #2

@YXZ252426

Description

@YXZ252426

在 claimNextOwnership 中:
控制器 A 可能因为读取到 Next == new 而继续,控制器 B 也可能在 QueryNext 失败后仍成功执行 UpdateConfig(new, "Next") 而继续,从而导致多个 ctrler 同时获得 ownership。

// claimNextOwnership tries to claim ownership of the "Next" configuration
func (sck *ShardCtrler) claimNextOwnership(old, new *shardcfg.ShardConfig) bool {
	nextStr := new.String()

	for {
		nextValue, nextCfg := sck.QueryNext()
		if nextCfg != nil && nextCfg.Num > old.Num {
			return new.Num == nextCfg.Num && nextValue == nextStr
		}
		err := sck.UpdateConfig(new, "Next")
		if err == rpc.OK {
			return true
		}

在 migrate 中:
当前 migrate() 不是原子迁移:FreezeShard 成功之后,如果后续步骤(例如 InstallShard)失败并提前返回,就会留下无法自动恢复的 frozen shard。

	for {
		if sck.hasConfigApplied(new.Num) {
			return false
		}
		state, err = srcClerk.FreezeShard(move.Shard, new.Num)
		if err == rpc.OK {
			break
		}
		if err == rpc.ErrWrongGroup || err == rpc.ErrStaleNum {
			return false
		}
	}
	for {
		if sck.hasConfigApplied(new.Num) {
			return false
		}
		err = dstClerk.InstallShard(move.Shard, state, new.Num)
		if err == rpc.OK {
			break
		}
		if err == rpc.ErrWrongGroup || err == rpc.ErrStaleNum {
			return false
		}
	}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions