This commit is contained in:
Yun-Sheng Chang 2025-03-15 22:11:08 -04:00
parent fe6ae451c1
commit ea04b4c78f
7 changed files with 55 additions and 70 deletions

View File

@@ -68,7 +68,9 @@ func (ts *Test) onePartition(p []int, req any) any {
 	// try all the servers, maybe one is the leader but give up after NSEC
 	t0 := time.Now()
 	for time.Since(t0).Seconds() < NSEC {
+		ts.mu.Lock()
 		index := ts.leader
+		ts.mu.Unlock()
 		for range ts.srvs {
 			if ts.g.IsConnected(index) {
 				s := ts.srvs[index]
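The added Lock/Unlock pair turns the read of ts.leader into a proper critical section; presumably another goroutine updates ts.leader concurrently, and an unguarded read would be flagged by go test -race. A minimal sketch of the same guarded-read pattern, with hypothetical names (tracker, leader) standing in for the tester's actual types:

package main

import (
	"fmt"
	"sync"
)

// tracker is a hypothetical stand-in for the tester's Test struct:
// one goroutine updates leader while others read it, so every
// access goes through the mutex.
type tracker struct {
	mu     sync.Mutex
	leader int
}

func (t *tracker) setLeader(i int) {
	t.mu.Lock()
	t.leader = i
	t.mu.Unlock()
}

func (t *tracker) getLeader() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.leader
}

func main() {
	t := &tracker{}
	done := make(chan struct{})
	go func() { t.setLeader(2); close(done) }()
	fmt.Println(t.getLeader()) // safe even while setLeader runs; try go run -race
	<-done
}

Copying the field into a local under the lock (index := ts.leader above) also keeps the critical section as short as possible.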

View File

@@ -878,10 +878,11 @@ func TestPersist33C(t *testing.T) {
 	ts.g.ShutdownServer((leader + 0) % servers)
 	ts.g.ShutdownServer((leader + 1) % servers)
 	tester.AnnotateShutdown([]int{(leader + 0) % servers, (leader + 1) % servers})
-	ts.restart((leader + 2) % servers)
-	ts.restart((leader + 0) % servers)
-	tester.AnnotateRestart([]int{(leader + 2) % servers, (leader + 0) % servers})
+	ts.g.ConnectOne((leader + 2) % servers)
 	tester.AnnotateConnection(ts.g.GetConnected())
+	ts.restart((leader + 0) % servers)
+	tester.AnnotateRestart([]int{(leader + 0) % servers})

 	ts.one(103, 2, true)

View File

@@ -37,12 +37,14 @@ func MakeShardCtrler(clnt *tester.Clnt, leases bool) *ShardCtrler {
 }

 // The tester calls InitController() before starting a new
-// controller. In this method you can implement recovery (part B) and
-// use a lock to become leader (part C). InitController may fail when
-// another controller supersedes (e.g., when this controller is
-// partitioned during recovery).
+// controller. In part A, this method doesn't need to do anything. In
+// B and C, this method implements recovery (part B) and uses a lock
+// to become leader (part C). InitController should return
+// rpc.ErrVersion when another controller supersedes it (e.g., when
+// this controller is partitioned during recovery); this happens only
+// in Part C. Otherwise, it returns rpc.OK.
 func (sck *ShardCtrler) InitController() rpc.Err {
-	return rpc.ErrNoKey
+	return rpc.ErrVersion
 }

 // The tester calls ExitController to exit a controller. In part B and
@@ -59,9 +61,10 @@ func (sck *ShardCtrler) InitConfig(cfg *shardcfg.ShardConfig) {
 }

 // Called by the tester to ask the controller to change the
-// configuration from the current one to new. It may return an error
-// if this controller is disconnected for a while and another
-// controller takes over in the mean time, as in part C.
+// configuration from the current one to new. It should return
+// rpc.ErrVersion if this controller is superseded by another
+// controller, as in part C. In all other cases, it should return
+// rpc.OK.
 func (sck *ShardCtrler) ChangeConfigTo(new *shardcfg.ShardConfig) rpc.Err {
 	return rpc.OK
 }
@@ -79,7 +82,7 @@ func (sck *ShardCtrler) isKilled() bool {
 }

-// Return the current configuration
+// Return the current configuration and its version number
 func (sck *ShardCtrler) Query() (*shardcfg.ShardConfig, rpc.Tversion) {
 	// Your code here.
 	return nil, 0
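Taken together, the revised comments pin down a simple contract: InitController and ChangeConfigTo return rpc.OK on success and rpc.ErrVersion once another controller has superseded this one (part C only). A hedged sketch of a driver that honors that contract; runController is a hypothetical name, and imports of the lab's rpc, shardcfg, and shardctrler packages are assumed:

// runController is a hypothetical driver showing the error contract
// described in the comments above. rpc.ErrVersion means another
// controller superseded this one, so the right move is to stop, not retry.
func runController(sck *shardctrler.ShardCtrler, next *shardcfg.ShardConfig) bool {
	if err := sck.InitController(); err == rpc.ErrVersion {
		return false // superseded during recovery (part C only)
	}
	defer sck.ExitController()
	if err := sck.ChangeConfigTo(next); err == rpc.ErrVersion {
		return false // another controller took over mid-change
	}
	return true // rpc.OK on both calls
}

The point of rpc.ErrVersion is that a superseded controller must bail out rather than retry; the 5C tests below check exactly this, that a reconnected stale controller cannot finish its ChangeConfigTo.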

View File

@@ -104,15 +104,13 @@ func TestJoinBasic5A(t *testing.T) {
 		ts.t.Fatalf("%d isn't a member of %v", gid2, cfg1)
 	}

-	// check shards at shardcfg.Gid2
-	ts.checkShutdownSharding(gid1, gid2, ka, va)
+	ts.checkShutdownSharding(gid1, ka, va)

 	for i := 0; i < len(ka); i++ {
 		ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
 	}

-	// check shards at shardcfg.Gid1
-	ts.checkShutdownSharding(gid2, gid1, ka, va)
+	ts.checkShutdownSharding(gid2, ka, va)

 	for i := 0; i < len(ka); i++ {
 		ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
@@ -172,8 +170,7 @@ func TestJoinLeaveBasic5A(t *testing.T) {
 		ts.t.Fatalf("joinGroups: err %v", err)
 	}

-	// check shards at shardcfg.Gid2
-	ts.checkShutdownSharding(gid1, gid2, ka, va)
+	ts.checkShutdownSharding(gid1, ka, va)

 	for i := 0; i < len(ka); i++ {
 		ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
@@ -204,8 +201,7 @@ func TestJoinLeaveBasic5A(t *testing.T) {
 		ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
 	}

-	// check shards at shardcfg.Gid2
-	ts.checkShutdownSharding(gid2, gid1, ka, va)
+	ts.checkShutdownSharding(gid2, ka, va)
 }

 // test many groups joining and leaving, reliable or unreliable
@@ -222,7 +218,7 @@ func joinLeave5A(t *testing.T, reliable bool, part string) {
 	ts.joinGroups(sck, grps)

-	ts.checkShutdownSharding(grps[0], grps[1], ka, va)
+	ts.checkShutdownSharding(grps[0], ka, va)

 	for i := 0; i < len(ka); i++ {
 		ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
@@ -260,7 +256,7 @@ func TestShutdown5A(t *testing.T) {
 	grps := ts.groups(NJOIN)
 	ts.joinGroups(sck, grps)

-	ts.checkShutdownSharding(grps[0], grps[1], ka, va)
+	ts.checkShutdownSharding(grps[0], ka, va)

 	for i := 0; i < len(ka); i++ {
 		ts.CheckGet(ck, ka[i], va[i], rpc.Tversion(1))
@@ -569,13 +565,13 @@ func TestJoinLeave5B(t *testing.T) {
 	}
 }

-// test recovery of partitioned controlers
+// test recovery of partitioned controllers
 func TestRecoverCtrler5B(t *testing.T) {
 	const (
 		NPARTITION = 5
 	)

-	ts := MakeTest(t, "Test (5B): recover controler ...", true)
+	ts := MakeTest(t, "Test (5B): recover controller ...", true)
 	defer ts.Cleanup()

 	gid := ts.setupKVService()
@@ -733,7 +729,7 @@ func TestLeaseBasicRefresh5C(t *testing.T) {

 // Test if old leader is fenced off when reconnecting while it is in
 // the middle of a Join.
-func TestPartitionControlerJoin5C(t *testing.T) {
+func TestPartitionControllerJoin5C(t *testing.T) {
 	const (
 		NSLEEP = 2
 		RAND   = 1000
@@ -759,8 +755,8 @@ func TestPartitionControlerJoin5C(t *testing.T) {
 		ch <- ts.join(sck, ngid, ts.Group(ngid).SrvNames())
 	}()

-	// sleep for a while to get the chance for the controler to get stuck
-	// in join or leave, because gid is down
+	// sleep for a while to get the chance for the controller to get
+	// stuck in join, because gid is down
 	time.Sleep(1 * time.Second)

 	// partition sck
@@ -771,19 +767,20 @@ func TestPartitionControlerJoin5C(t *testing.T) {
 	ts.Group(ngid).StartServers()

-	// start new controler to supersede partitioned one,
-	// it will also be stuck
+	// start new controller to supersede partitioned one,
 	sck0 := ts.makeShardCtrler()
 	if err := sck0.InitController(); err != rpc.OK {
 		t.Fatalf("failed to init controller %v", err)
 	}

+	scfg, _ := sck0.Query()
+	if !scfg.IsMember(ngid) {
+		t.Fatalf("Didn't recover gid %d", ngid)
+	}
+
 	sck0.ExitController()

-	//log.Printf("reconnect")
-	// reconnect old controller, which shouldn't be able
-	// to do anything
+	// reconnect old controller, which shouldn't finish ChangeConfigTo
 	clnt.ConnectAll()

 	err := <-ch

View File

@@ -2,7 +2,7 @@ package shardkv

 import (
 	"fmt"
-	//"log"
+	"log"
 	"math/rand"
 	"sync"
 	"sync/atomic"
@@ -131,22 +131,19 @@ func (ts *Test) join(sck *shardctrler.ShardCtrler, gid tester.Tgid, srvs []string) rpc.Err {
 	newcfg := cfg.Copy()
 	ok := newcfg.JoinBalance(map[tester.Tgid][]string{gid: srvs})
 	if !ok {
-		return rpc.ErrVersion
+		log.Fatalf("join: group %d is already present", gid)
 	}
-	err := sck.ChangeConfigTo(newcfg)
-	return err
+	return sck.ChangeConfigTo(newcfg)
 }

 func (ts *Test) joinGroups(sck *shardctrler.ShardCtrler, gids []tester.Tgid) rpc.Err {
-	for i, gid := range gids {
+	for _, gid := range gids {
 		ts.Config.MakeGroupStart(gid, NSRV, ts.StartServerShardGrp)
 		if err := ts.join(sck, gid, ts.Group(gid).SrvNames()); err != rpc.OK {
 			return err
 		}
-		if i < len(gids)-1 {
-			time.Sleep(INTERGRPDELAY * time.Millisecond)
-		}
+		time.Sleep(INTERGRPDELAY * time.Millisecond)
 	}
 	return rpc.OK
 }
@@ -156,21 +153,19 @@ func (ts *Test) leave(sck *shardctrler.ShardCtrler, gid tester.Tgid) rpc.Err {
 	newcfg := cfg.Copy()
 	ok := newcfg.LeaveBalance([]tester.Tgid{gid})
 	if !ok {
-		return rpc.ErrVersion
+		log.Fatalf("leave: group %d is already not present", gid)
 	}
 	return sck.ChangeConfigTo(newcfg)
 }

 func (ts *Test) leaveGroups(sck *shardctrler.ShardCtrler, gids []tester.Tgid) rpc.Err {
-	for i, gid := range gids {
+	for _, gid := range gids {
 		if err := ts.leave(sck, gid); err != rpc.OK {
 			return err
 		}
 		ts.Config.ExitGroup(gid)
-		if i < len(gids)-1 {
-			time.Sleep(INTERGRPDELAY * time.Millisecond)
-		}
+		time.Sleep(INTERGRPDELAY * time.Millisecond)
 	}
 	return rpc.OK
 }
@@ -196,31 +191,14 @@ func (ts *Test) disconnectClntFromLeader(clnt *tester.Clnt, gid tester.Tgid) int {
 	return l
 }

-func (ts *Test) checkLogs(gids []tester.Tgid) {
-	for _, gid := range gids {
-		n := ts.Group(gid).LogSize()
-		s := ts.Group(gid).SnapshotSize()
-		if ts.maxraftstate >= 0 && n > 8*ts.maxraftstate {
-			ts.t.Fatalf("persister.RaftStateSize() %v, but maxraftstate %v",
-				n, ts.maxraftstate)
-		}
-		if ts.maxraftstate < 0 && s > 0 {
-			ts.t.Fatalf("maxraftstate is -1, but snapshot is non-empty!")
-		}
-	}
-}
-
 // make sure that the data really is sharded by
 // shutting down one shard and checking that some
 // Get()s don't succeed.
-func (ts *Test) checkShutdownSharding(down, up tester.Tgid, ka []string, va []string) {
+func (ts *Test) checkShutdownSharding(down tester.Tgid, ka []string, va []string) {
 	const NSEC = 2

 	ts.Group(down).Shutdown()
-	ts.checkLogs([]tester.Tgid{down, up}) // forbid snapshots

 	n := len(ka)
 	ch := make(chan string)
 	done := int32(0)
@@ -239,7 +217,6 @@ func (ts *Test) checkShutdownSharding(down, up tester.Tgid, ka []string, va []string) {
 		}(xi)
 	}

-	// wait a bit, only about half the Gets should succeed.
 	ndone := 0
 	for atomic.LoadInt32(&done) != 1 {
 		select {
@@ -254,9 +231,9 @@ func (ts *Test) checkShutdownSharding(down, up tester.Tgid, ka []string, va []string) {
 		}
 	}

-	//log.Printf("%d completions out of %d; down %d", ndone, n, down)
+	// log.Printf("%d completions out of %d; down %d", ndone, n, down)
 	if ndone >= n {
-		ts.Fatalf("expected less than %d completions with one shard dead\n", n)
+		ts.Fatalf("expected less than %d completions with shard %d down\n", n, down)
 	}

 	// bring the crashed shard/group back to life.
@@ -360,8 +337,8 @@ func (ts *Test) killCtrler(ck kvtest.IKVClerk, gid tester.Tgid, ka, va []string) {
 	sck0.ExitController()

 	if ts.leases {
-		// reconnect old controller, which shouldn't be able
-		// to do anything
+		// reconnect old controller, which should bail out, because
+		// it has been superseded.
 		clnt.ConnectAll()
 		time.Sleep(1 * time.Second)

View File

@@ -74,6 +74,7 @@ type ServerGrp struct {
 	gid       Tgid
 	connected []bool // whether each server is on the net
 	mks       FstartServer
+	mu        sync.Mutex
 }

 func makeSrvGrp(net *labrpc.Network, gid Tgid, n int, mks FstartServer) *ServerGrp {
@@ -174,7 +175,9 @@ func (sg *ServerGrp) connect(i int, to []int) {

 func (sg *ServerGrp) disconnect(i int, from []int) {
 	// log.Printf("%p: disconnect peer %d from %v\n", sg, i, from)
+	sg.mu.Lock()
 	sg.connected[i] = false
+	sg.mu.Unlock()

 	// outgoing socket files
 	sg.srvs[i].disconnect(from)
@@ -195,6 +198,8 @@ func (sg *ServerGrp) DisconnectAll(i int) {
 }

 func (sg *ServerGrp) IsConnected(i int) bool {
+	defer sg.mu.Unlock()
+	sg.mu.Lock()
 	return sg.connected[i]
 }
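Note that IsConnected registers the deferred Unlock before calling Lock. This is still correct Go: a deferred call runs only when the function returns, and by then Lock has succeeded. The conventional ordering is equivalent and perhaps easier to read:

// Equivalent to the committed version, with the customary
// Lock-then-defer-Unlock ordering.
func (sg *ServerGrp) IsConnected(i int) bool {
	sg.mu.Lock()
	defer sg.mu.Unlock()
	return sg.connected[i]
}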

View File

@@ -87,7 +87,7 @@ func (s *Server) shutdownServer() {
 	// inform all services to stop
 	for _, svc := range s.svcs {
 		if svc != nil {
-			go svc.Kill()
+			svc.Kill()
 		}
 	}
 	s.svcs = nil
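Dropping the go keyword makes shutdownServer wait for each Kill to return before moving on, so every service has actually stopped by the time s.svcs = nil executes; with go svc.Kill(), a Kill could still be running while (or after) the slice is cleared. A minimal sketch of the ordering this guarantees, using a hypothetical service type:

package main

import "fmt"

// service is a hypothetical stand-in for the tester's svc values.
type service struct{ stopped bool }

func (s *service) Kill() { s.stopped = true }

func main() {
	svcs := []*service{{}, {}}
	for _, svc := range svcs {
		if svc != nil {
			svc.Kill() // synchronous: returns only after the service has stopped
		}
	}
	for _, svc := range svcs {
		fmt.Println("stopped:", svc.stopped) // true, true
	}
	svcs = nil // safe: no Kill can still be running in the background
}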