diff --git a/src/kvraft1/kvraft_test.go b/src/kvraft1/kvraft_test.go index 207d7c4..c9cdbcf 100644 --- a/src/kvraft1/kvraft_test.go +++ b/src/kvraft1/kvraft_test.go @@ -5,14 +5,15 @@ import ( "strconv" "testing" "time" + "fmt" "6.5840/kvraft1/rsm" "6.5840/kvsrv1/rpc" "6.5840/kvtest1" + tester "6.5840/tester1" ) const ( - NSEC = 1 NCLNT = 10 ) @@ -29,6 +30,7 @@ const ( func (ts *Test) GenericTest() { const ( NITER = 3 + NSEC = 1 T = NSEC * time.Second NKEYS = 100 ) @@ -64,17 +66,21 @@ func (ts *Test) GenericTest() { <-ch_spawn // wait for clients to be done + if i == NITER-1 { + tester.SetAnnotationFinalized() + } ts.CheckPorcupine() if ts.partitions { ch_partitioner <- true - // log.Printf("wait for partitioner\n") + //log.Printf("wait for partitioner\n") <-ch_partitioner // reconnect network and submit a request. A client may // have submitted a request in a minority. That request // won't return until that server discovers a new term // has started. ts.Group(Gid).ConnectAll() + tester.AnnotateClearFailure() // wait for a while so that we have a new term time.Sleep(kvtest.ElectionTimeout) } @@ -84,6 +90,7 @@ func (ts *Test) GenericTest() { for i := 0; i < ts.nservers; i++ { ts.Group(Gid).ShutdownServer(i) } + tester.AnnotateShutdownAll() // Wait for a while for servers to shutdown, since // shutdown isn't a real crash and isn't instantaneous time.Sleep(kvtest.ElectionTimeout) @@ -93,6 +100,7 @@ func (ts *Test) GenericTest() { ts.Group(Gid).StartServer(i) } ts.Group(Gid).ConnectAll() + tester.AnnotateClearFailure() } if ts.maxraftstate > 0 { @@ -100,14 +108,18 @@ func (ts *Test) GenericTest() { // requests and had time to checkpoint. sz := ts.Config.Group(Gid).LogSize() if sz > 8*ts.maxraftstate { - ts.t.Fatalf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate) + err := fmt.Sprintf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate) + tester.AnnotateCheckerFailure(err, err) + ts.t.Fatalf(err) } } if ts.maxraftstate < 0 { // Check that snapshots are not used ssz := ts.Group(Gid).SnapshotSize() if ssz > 0 { - ts.t.Fatalf("snapshot too large (%v), should not be used when maxraftstate = %d", ssz, ts.maxraftstate) + err := fmt.Sprintf("snapshot too large (%v), should not be used when maxraftstate = %d", ssz, ts.maxraftstate) + tester.AnnotateCheckerFailure(err, err) + ts.t.Fatalf(err) } } } @@ -150,21 +162,25 @@ func (ts *Test) GenericTestSpeed() { func TestBasic4B(t *testing.T) { ts := MakeTest(t, "4B basic", 1, 5, true, false, false, -1, false) + tester.AnnotateTest("TestBasic4B", ts.nservers) ts.GenericTest() } func TestSpeed4B(t *testing.T) { ts := MakeTest(t, "4B speed", 1, 3, true, false, false, -1, false) + tester.AnnotateTest("TestSpeed4B", ts.nservers) ts.GenericTestSpeed() } func TestConcurrent4B(t *testing.T) { ts := MakeTest(t, "4B many clients", 5, 5, true, false, false, -1, false) + tester.AnnotateTest("TestConcurrent4B", ts.nservers) ts.GenericTest() } func TestUnreliable4B(t *testing.T) { ts := MakeTest(t, "4B unreliable net, many clients", 5, 5, false, false, false, -1, false) + tester.AnnotateTest("TestUnreliable4B", ts.nservers) ts.GenericTest() } @@ -175,19 +191,29 @@ func TestOnePartition4B(t *testing.T) { ts := MakeTest(t, "4B progress in majority", 0, 5, false, false, false, -1, false) defer ts.Cleanup() + tester.AnnotateTest("TestOnePartition4B", ts.nservers) + ck := ts.MakeClerk() ver0 := ts.PutAtLeastOnce(ck, "1", "13", rpc.Tversion(0), -1) - _, l := rsm.Leader(ts.Config, Gid) + foundl, l := rsm.Leader(ts.Config, Gid) + if foundl { + text := 
fmt.Sprintf("leader found = %v", l) + tester.AnnotateInfo(text, text) + } else { + text := "did not find a leader" + tester.AnnotateInfo(text, text) + } p1, p2 := ts.Group(Gid).MakePartition(l) ts.Group(Gid).Partition(p1, p2) + tester.AnnotateTwoPartitions(p1, p2) ckp1 := ts.MakeClerkTo(p1) // connect ckp1 to p1 ckp2a := ts.MakeClerkTo(p2) // connect ckp2a to p2 ckp2b := ts.MakeClerkTo(p2) // connect ckp2b to p2 - ver1 := ts.PutAtLeastOnce(ckp1, "1", "14", ver0+1, -1) + ver1 := ts.PutAtLeastOnce(ckp1, "1", "14", ver0, -1) ts.CheckGet(ckp1, "1", "14", ver1) ts.End() @@ -196,8 +222,9 @@ func TestOnePartition4B(t *testing.T) { done1 := make(chan rpc.Tversion) ts.Begin("Test: no progress in minority (4B)") + tester.AnnotateCheckerBegin(fmt.Sprintf("submit Put(1, 15) and Get(1) to %v", p2)) go func() { - ver := ts.PutAtLeastOnce(ckp2a, "1", "15", ver1+1, -1) + ver := ts.PutAtLeastOnce(ckp2a, "1", "15", ver1, -1) done0 <- ver }() go func() { @@ -207,14 +234,20 @@ func TestOnePartition4B(t *testing.T) { select { case ver := <-done0: - t.Fatalf("Put in minority completed %v", ver) + err := fmt.Sprintf("Put in minority completed with version = %v", ver) + tester.AnnotateCheckerFailure(err, err) + t.Fatalf(err) case ver := <-done1: - t.Fatalf("Get in minority completed %v", ver) + err := fmt.Sprintf("Get in minority completed with version = %v", ver) + tester.AnnotateCheckerFailure(err, err) + t.Fatalf(err) case <-time.After(time.Second): } + tester.AnnotateCheckerSuccess( + "commands to minority partition not committed after 1 second", "not committed") ts.CheckGet(ckp1, "1", "14", ver1) - ver2 := ts.PutAtLeastOnce(ckp1, "1", "16", ver1+1, -1) + ver2 := ts.PutAtLeastOnce(ckp1, "1", "16", ver1, -1) ts.CheckGet(ckp1, "1", "16", ver2) ts.End() @@ -222,64 +255,82 @@ func TestOnePartition4B(t *testing.T) { ts.Begin("Test: completion after heal (4B)") ts.Group(Gid).ConnectAll() + tester.AnnotateClearFailure() ckp2a.(*kvtest.TestClerk).Clnt.ConnectAll() ckp2b.(*kvtest.TestClerk).Clnt.ConnectAll() time.Sleep(kvtest.ElectionTimeout) + tester.AnnotateCheckerBegin("status of Put(1, 15)") + ver15 := rpc.Tversion(0) select { - case <-done0: + case ver15 = <-done0: case <-time.After(30 * 100 * time.Millisecond): + tester.AnnotateCheckerFailure( + "Put(1, 15) did not complete after partition resolved", "OK") t.Fatalf("Put did not complete") } + tester.AnnotateCheckerSuccess("Put(1, 15) completed after partition resolved", "OK") + tester.AnnotateCheckerBegin("status of Get(1)") select { case <-done1: case <-time.After(30 * 100 * time.Millisecond): + tester.AnnotateCheckerFailure( + "Get(1) did not complete after partition resolved", "OK") t.Fatalf("Get did not complete") default: } + tester.AnnotateCheckerSuccess("Get(1) completed after partition resolved", "OK") - ts.CheckGet(ck, "1", "15", ver2+1) + ts.CheckGet(ck, "1", "15", ver15) } func TestManyPartitionsOneClient4B(t *testing.T) { ts := MakeTest(t, "4B partitions, one client", 1, 5, false, false, true, -1, false) + tester.AnnotateTest("TestManyPartitionsOneClient4B", ts.nservers) ts.GenericTest() } func TestManyPartitionsManyClients4B(t *testing.T) { ts := MakeTest(t, "4B partitions, many clients (4B)", 5, 5, false, false, true, -1, false) + tester.AnnotateTest("TestManyPartitionsManyClients4B", ts.nservers) ts.GenericTest() } func TestPersistOneClient4B(t *testing.T) { ts := MakeTest(t, "4B restarts, one client 4B ", 1, 5, false, true, false, -1, false) + tester.AnnotateTest("TestPersistOneClient4B", ts.nservers) ts.GenericTest() } func 
TestPersistConcurrent4B(t *testing.T) { ts := MakeTest(t, "4B restarts, many clients", 5, 5, false, true, false, -1, false) + tester.AnnotateTest("TestPersistConcurrent4B", ts.nservers) ts.GenericTest() } func TestPersistConcurrentUnreliable4B(t *testing.T) { ts := MakeTest(t, "4B unreliable net, restarts, many clients ", 5, 5, true, true, false, -1, false) + tester.AnnotateTest("TestPersistConcurrentUnreliable4B", ts.nservers) ts.GenericTest() } func TestPersistPartition4B(t *testing.T) { ts := MakeTest(t, "4B restarts, partitions, many clients", 5, 5, false, true, true, -1, false) + tester.AnnotateTest("TestPersistPartition4B", ts.nservers) ts.GenericTest() } func TestPersistPartitionUnreliable4B(t *testing.T) { ts := MakeTest(t, "4B unreliable net, restarts, partitions, many clients", 5, 5, true, true, true, -1, false) + tester.AnnotateTest("TestPersistPartitionUnreliable4B", ts.nservers) ts.GenericTest() } func TestPersistPartitionUnreliableLinearizable4B(t *testing.T) { ts := MakeTest(t, "4B unreliable net, restarts, partitions, random keys, many clients", 15, 7, true, true, true, -1, true) + tester.AnnotateTest("TestPersistPartitionUnreliableLinearizable4B", ts.nservers) ts.GenericTest() } @@ -293,6 +344,7 @@ func TestSnapshotRPC4C(t *testing.T) { MAXRAFTSTATE = 1000 ) ts := MakeTest(t, "4C SnapshotsRPC", 0, NSRV, true, false, false, MAXRAFTSTATE, false) + tester.AnnotateTest("TestSnapshotRPC4C", ts.nservers) defer ts.Cleanup() ck := ts.MakeClerk() @@ -305,6 +357,7 @@ func TestSnapshotRPC4C(t *testing.T) { verb := rpc.Tversion(0) // a bunch of puts into the majority partition. ts.Group(Gid).Partition([]int{0, 1}, []int{2}) + tester.AnnotateTwoPartitions([]int{0, 1}, []int{2}) { ck1 := ts.MakeClerkTo([]int{0, 1}) for i := 0; i < 50; i++ { @@ -318,15 +371,19 @@ func TestSnapshotRPC4C(t *testing.T) { // most of its log entries. sz := ts.Group(Gid).LogSize() if sz > 8*ts.maxraftstate { - t.Fatalf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate) + err := fmt.Sprintf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate) + tester.AnnotateCheckerFailure(err, err) + t.Fatalf(err) } // now make group that requires participation of // lagging server, so that it has to catch up. + verc := rpc.Tversion(0) ts.Group(Gid).Partition([]int{0, 2}, []int{1}) + tester.AnnotateTwoPartitions([]int{0, 2}, []int{1}) { ck1 := ts.MakeClerkTo([]int{0, 2}) - ts.PutAtLeastOnce(ck1, "c", "C", rpc.Tversion(0), -1) + verc = ts.PutAtLeastOnce(ck1, "c", "C", rpc.Tversion(0), -1) ts.PutAtLeastOnce(ck1, "d", "D", rpc.Tversion(0), -1) ts.CheckGet(ck1, "a", "A", vera) ts.CheckGet(ck1, "b", "B", verb) @@ -336,9 +393,10 @@ func TestSnapshotRPC4C(t *testing.T) { // now everybody ts.Group(Gid).Partition([]int{0, 1, 2}, []int{}) + tester.AnnotateClearFailure() vere := ts.PutAtLeastOnce(ck, "e", "E", rpc.Tversion(0), -1) - ts.CheckGet(ck, "c", "C", 1) + ts.CheckGet(ck, "c", "C", verc) ts.CheckGet(ck, "e", "E", vere) ts.CheckGet(ck, "1", "1", rpc.Tversion(1)) } @@ -347,6 +405,7 @@ func TestSnapshotRPC4C(t *testing.T) { // operations we're doing here. 
func TestSnapshotSize4C(t *testing.T) { ts := MakeTest(t, "4C snapshot size is reasonable", 0, 3, true, false, false, 1000, false) + tester.AnnotateTest("TestSnapshotSize4C", ts.nservers) defer ts.Cleanup() maxsnapshotstate := 500 @@ -357,55 +416,65 @@ func TestSnapshotSize4C(t *testing.T) { for i := 0; i < 200; i++ { ver = ts.PutAtLeastOnce(ck, "x", "0", ver, -1) ts.CheckGet(ck, "x", "0", ver) - ver = ts.PutAtLeastOnce(ck, "x", "1", ver+1, -1) + ver = ts.PutAtLeastOnce(ck, "x", "1", ver, -1) ts.CheckGet(ck, "x", "1", ver) - ver += 1 } // check that servers have thrown away most of their log entries sz := ts.Group(Gid).LogSize() if sz > 8*ts.maxraftstate { - t.Fatalf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate) + err := fmt.Sprintf("logs were not trimmed (%v > 8*%v)", sz, ts.maxraftstate) + tester.AnnotateCheckerFailure(err, err) + t.Fatalf(err) } // check that the snapshots are not unreasonably large ssz := ts.Group(Gid).SnapshotSize() if ssz > maxsnapshotstate { - t.Fatalf("snapshot too large (%v > %v)", ssz, maxsnapshotstate) + err := fmt.Sprintf("snapshot too large (%v > %v)", ssz, maxsnapshotstate) + tester.AnnotateCheckerFailure(err, err) + t.Fatalf(err) } } func TestSpeed4C(t *testing.T) { ts := MakeTest(t, "4C speed", 1, 3, true, false, false, 1000, false) + tester.AnnotateTest("TestSpeed4C", ts.nservers) ts.GenericTestSpeed() } func TestSnapshotRecover4C(t *testing.T) { ts := MakeTest(t, "4C restarts, snapshots, one client", 1, 5, true, true, false, 1000, false) + tester.AnnotateTest("TestSnapshotRecover4C", ts.nservers) ts.GenericTest() } func TestSnapshotRecoverManyClients4C(t *testing.T) { ts := MakeTest(t, "4C restarts, snapshots, many clients ", 20, 5, true, true, false, 1000, false) + tester.AnnotateTest("TestSnapshotRecoverManyClients4C", ts.nservers) ts.GenericTest() } func TestSnapshotUnreliable4C(t *testing.T) { ts := MakeTest(t, "4C unreliable net, snapshots, many clients", 5, 5, false, false, false, 1000, false) + tester.AnnotateTest("TestSnapshotUnreliable4C", ts.nservers) ts.GenericTest() } func TestSnapshotUnreliableRecover4C(t *testing.T) { ts := MakeTest(t, "4C unreliable net, restarts, snapshots, many clients", 5, 5, false, true, false, 1000, false) + tester.AnnotateTest("TestSnapshotUnreliableRecover4C", ts.nservers) ts.GenericTest() } func TestSnapshotUnreliableRecoverConcurrentPartition4C(t *testing.T) { ts := MakeTest(t, "4C unreliable net, restarts, partitions, snapshots, many clients", 5, 5, false, true, true, 1000, false) + tester.AnnotateTest("TestSnapshotUnreliableRecoverConcurrentPartition4C", ts.nservers) ts.GenericTest() } func TestSnapshotUnreliableRecoverConcurrentPartitionLinearizable4C(t *testing.T) { ts := MakeTest(t, "4C unreliable net, restarts, partitions, snapshots, random keys, many clients", 15, 7, false, true, true, 1000, true) + tester.AnnotateTest("TestSnapshotUnreliableRecoverConcurrentPartitionLinearizable4C", ts.nservers) ts.GenericTest() } diff --git a/src/kvraft1/rsm/rsm_test.go b/src/kvraft1/rsm/rsm_test.go index 51fca2c..0a57adf 100644 --- a/src/kvraft1/rsm/rsm_test.go +++ b/src/kvraft1/rsm/rsm_test.go @@ -2,11 +2,13 @@ package rsm import ( //"log" + "fmt" "sync" "testing" "time" "6.5840/kvsrv1/rpc" + "6.5840/tester1" ) // test that each server executes increments and updates its counter. 
@@ -15,11 +17,15 @@ func TestBasic4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestBasic4A", NSRV) ts.Begin("Test RSM basic") + for i := 0; i < NINC; i++ { r := ts.oneInc() if r.N != i+1 { - ts.t.Fatalf("expected %d instead of %d", i, r.N) + err := fmt.Sprintf("expected %d instead of %d", i, r.N) + tester.AnnotateCheckerFailure(err, err) + ts.t.Fatalf(err) } ts.checkCounter(r.N, NSRV) } @@ -31,6 +37,7 @@ func TestConcurrent4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestConcurrent4A", NSRV) ts.Begin("Test concurrent submit") var wg sync.WaitGroup @@ -51,16 +58,19 @@ func TestLeaderFailure4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestLeaderFailure4A", NSRV) ts.Begin("Test Leader Failure") r := ts.oneInc() ts.checkCounter(r.N, NSRV) l := ts.disconnectLeader() + tester.AnnotateConnection(ts.g.GetConnected()) r = ts.oneInc() ts.checkCounter(r.N, NSRV-1) ts.connect(l) + tester.AnnotateConnection(ts.g.GetConnected()) ts.checkCounter(r.N, NSRV) } @@ -72,6 +82,7 @@ func TestLeaderPartition4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestLeaderPartition4A", NSRV) ts.Begin("Test Leader Partition") // submit an Inc @@ -79,10 +90,20 @@ func TestLeaderPartition4A(t *testing.T) { ts.checkCounter(r.N, NSRV) // partition leader - _, l := Leader(ts.Config, Gid) + foundl, l := Leader(ts.Config, Gid) + if foundl { + text := fmt.Sprintf("leader found = %v", l) + tester.AnnotateInfo(text, text) + } else { + text := "did not find a leader" + tester.AnnotateInfo(text, text) + } p1, p2 := ts.Group(Gid).MakePartition(l) ts.Group(Gid).Partition(p1, p2) + tester.AnnotateTwoPartitions(p1, p2) + text := fmt.Sprintf("concurrently submitting %v Dec to %v", NSUBMIT, l) + tester.AnnotateInfo(text, text) done := make(chan struct{}) go func() { // Submit many Dec's concurrently, which will results in many @@ -109,17 +130,24 @@ func TestLeaderPartition4A(t *testing.T) { select { case err := <-done: - ts.Fatalf("Dec's in minority completed %v", err) + text := fmt.Sprintf("Dec's in minority completed; Submit returns %v", err) + tester.AnnotateCheckerFailure(text, text) + ts.Fatalf(text) case <-time.After(time.Second): } // reconnect leader ts.connect(l) + // use the fact that there are only three servers, so connecting the leader + // resolves all partitions (the last line of test also relies on this) + tester.AnnotateClearFailure() select { case <-done: case <-time.After(time.Second): - ts.Fatalf("Submit after healing didn't return") + text := "Submit after healing didn't return" + tester.AnnotateCheckerFailure(text, text) + ts.Fatalf(text) } // check that all replicas have the same value for counter @@ -136,27 +164,34 @@ func TestRestartReplay4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestRestartReplay4A", NSRV) ts.Begin("Test Restart") for i := 0; i < NINC; i++ { r := ts.oneInc() if r.N != i+1 { - ts.t.Fatalf("expected %d instead of %d", i, r.N) + err := fmt.Sprintf("expected %d instead of %d", i, r.N) + tester.AnnotateCheckerFailure(err, err) + ts.t.Fatalf(err) } ts.checkCounter(r.N, NSRV) } ts.Group(Gid).Shutdown() + tester.AnnotateShutdownAll() time.Sleep(1 * time.Second) ts.Group(Gid).StartServers() + tester.AnnotateRestartAll() // submit an Inc r := ts.oneInc() if r.N != NINC+1 { - t.Fatalf("Expected %d got %d", NINC+1, r.N) + err := fmt.Sprintf("expected %d got %d", NINC+1, r.N) + tester.AnnotateCheckerFailure(err, err) + 
t.Fatalf(err) } time.Sleep(1 * time.Second) @@ -176,6 +211,7 @@ func TestShutdown4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestShutdown4A", NSRV) ts.Begin("Test Shutdown") // Submit many Null's concurrently @@ -197,11 +233,14 @@ func TestShutdown4A(t *testing.T) { time.Sleep(20 * time.Millisecond) ts.Group(Gid).Shutdown() + tester.AnnotateShutdownAll() select { case <-done: case <-time.After((NSEC + 1) * time.Second): - ts.Fatalf("Submit didn't stop after shutdown") + err := "Submit didn't stop after shutdown" + tester.AnnotateCheckerFailure(err, err) + ts.Fatalf(err) } } @@ -216,32 +255,41 @@ func TestRestartSubmit4A(t *testing.T) { ts := makeTest(t, -1) defer ts.cleanup() + tester.AnnotateTest("TestRestartSubmit4A", NSRV) ts.Begin("Test Restart and submit") for i := 0; i < NINC; i++ { r := ts.oneInc() if r.N != i+1 { - ts.t.Fatalf("expected %d instead of %d", i, r.N) + err := fmt.Sprintf("expected %d instead of %d", i, r.N) + tester.AnnotateCheckerFailure(err, err) + ts.t.Fatalf(err) } ts.checkCounter(r.N, NSRV) } ts.Group(Gid).Shutdown() + tester.AnnotateShutdownAll() time.Sleep(1 * time.Second) ts.Group(Gid).StartServers() + tester.AnnotateRestartAll() // submit an Inc r := ts.oneInc() if r.N != NINC+1 { - t.Fatalf("Expected %d got %d", NINC+1, r.N) + err := fmt.Sprintf("Expected %d got %d", NINC+1, r.N) + tester.AnnotateCheckerFailure(err, err) + t.Fatalf(err) } time.Sleep(1 * time.Second) // Submit many Null's concurrently + text := fmt.Sprintf("concurrently submitting %v Null", NSUBMIT) + tester.AnnotateInfo(text, text) done := make(chan struct{}) go func() { var wg sync.WaitGroup @@ -260,14 +308,18 @@ func TestRestartSubmit4A(t *testing.T) { time.Sleep(20 * time.Millisecond) ts.Group(Gid).Shutdown() + tester.AnnotateShutdownAll() select { case <-done: case <-time.After((NSEC + 1) * time.Second): - ts.Fatalf("Submit didn't stop after shutdown") + err := "Submit didn't stop after shutdown" + tester.AnnotateCheckerFailure(err, err) + ts.Fatalf(err) } ts.Group(Gid).StartServers() + tester.AnnotateRestartAll() r = ts.oneInc() ts.checkCounter(r.N, NSRV) @@ -283,6 +335,9 @@ func TestSnapshot4C(t *testing.T) { ts := makeTest(t, MAXRAFTSTATE) defer ts.cleanup() + tester.AnnotateTest("TestSnapshot4C", NSRV) + ts.Begin("Test creating and restoring snapshot") + for i := 0; i < N; i++ { ts.oneInc() } @@ -290,14 +345,18 @@ func TestSnapshot4C(t *testing.T) { sz := ts.Group(Gid).LogSize() if sz > 2*MAXRAFTSTATE { - ts.Fatalf("logs were not trimmed (%v > 2 * %v)", sz, ts.maxraftstate) + err := fmt.Sprintf("logs were not trimmed (%v > 2 * %v)", sz, ts.maxraftstate) + tester.AnnotateCheckerFailure(err, err) + ts.Fatalf(err) } // rsm must have made snapshots by now; shutdown all servers and // restart them from a snapshot ts.g.Shutdown() + tester.AnnotateShutdownAll() ts.g.StartServers() + tester.AnnotateRestartAll() // make restarted servers do one increment ts.oneInc() diff --git a/src/kvraft1/rsm/test.go b/src/kvraft1/rsm/test.go index 1fb3f8d..681b101 100644 --- a/src/kvraft1/rsm/test.go +++ b/src/kvraft1/rsm/test.go @@ -5,6 +5,7 @@ import ( "sync" "testing" "time" + "fmt" "6.5840/kvsrv1/rpc" "6.5840/labrpc" @@ -115,6 +116,8 @@ func (ts *Test) checkCounter(v int, nsrv int) { for iters := 0; iters < 30; iters++ { n = ts.countValue(v) if n >= nsrv { + text := fmt.Sprintf("all %v servers have counter value %v", nsrv, v) + tester.AnnotateCheckerSuccess(text, text) return } time.Sleep(to) @@ -122,7 +125,9 @@ func (ts *Test) checkCounter(v int, nsrv int) { to *= 
2 } } - ts.Fatalf("checkCounter: only %d srvs have %v instead of %d", n, v, nsrv) + err := fmt.Sprintf("checkCounter: only %d srvs have %v instead of %d", n, v, nsrv) + tester.AnnotateCheckerFailure(err, err) + ts.Fatalf(err) } func (ts *Test) countValue(v int) int { diff --git a/src/kvtest1/kvtest.go b/src/kvtest1/kvtest.go index f07edf3..ef38d35 100644 --- a/src/kvtest1/kvtest.go +++ b/src/kvtest1/kvtest.go @@ -3,6 +3,7 @@ package kvtest import ( "encoding/json" //"log" + "fmt" "math/rand" "strconv" "testing" @@ -78,9 +79,11 @@ func (ts *Test) MakeClerk() IKVClerk { // Assumes different ck's put to different keys func (ts *Test) PutAtLeastOnce(ck IKVClerk, key, value string, ver rpc.Tversion, me int) rpc.Tversion { + verPrev := ver for true { err := ts.Put(ck, key, value, ver, me) if err == rpc.OK { + ver += 1 break } if err == rpc.ErrMaybe || err == rpc.ErrVersion { @@ -92,17 +95,27 @@ func (ts *Test) PutAtLeastOnce(ck IKVClerk, key, value string, ver rpc.Tversion, } } } + desp := fmt.Sprintf("Put(%v, %v) completes", key, value) + details := fmt.Sprintf("version: %v -> %v", verPrev, ver) + tester.AnnotateInfo(desp, details) return ver } func (ts *Test) CheckGet(ck IKVClerk, key, value string, version rpc.Tversion) { + tester.AnnotateCheckerBegin(fmt.Sprintf("checking Get(%v) = (%v, %v)", key, value, version)) val, ver, err := ts.Get(ck, key, 0) if err != rpc.OK { - ts.Fatalf("CheckGet err %v", err) + text := fmt.Sprintf("Get(%v) returns error = %v", key, err) + tester.AnnotateCheckerFailure(text, text) + ts.Fatalf(text) } - if val != value || ver != ver { - ts.Fatalf("Get(%v): expected:\n%v %v\nreceived:\n%v %v", key, value, val, version, ver) + if val != value || ver != version { + text := fmt.Sprintf("Get(%v) returns (%v, %v) != (%v, %v)", key, val, ver, value, version) + tester.AnnotateCheckerFailure(text, text) + ts.Fatalf(text) } + text := fmt.Sprintf("Get(%v) returns (%v, %v) as expected", key, val, ver) + tester.AnnotateCheckerSuccess(text, "OK") } type ClntRes struct { @@ -214,9 +227,10 @@ func (ts *Test) OnePut(me int, ck IKVClerk, key string, ver rpc.Tversion) (rpc.T // repartition the servers periodically func (ts *Test) Partitioner(gid tester.Tgid, ch chan bool) { + //log.Printf("partioner %v", gid) defer func() { ch <- true }() for true { - switch { + select { case <-ch: return default: @@ -234,6 +248,7 @@ func (ts *Test) Partitioner(gid tester.Tgid, ch chan bool) { } } ts.Group(gid).Partition(pa[0], pa[1]) + tester.AnnotateTwoPartitions(pa[0], pa[1]) time.Sleep(ElectionTimeout + time.Duration(rand.Int63()%200)*time.Millisecond) } } diff --git a/src/kvtest1/porcupine.go b/src/kvtest1/porcupine.go index 75a96b2..0b3b079 100644 --- a/src/kvtest1/porcupine.go +++ b/src/kvtest1/porcupine.go @@ -83,9 +83,7 @@ func Put(cfg *tester.Config, ck IKVClerk, key string, value string, version rpc. // Checks that the log of Clerk.Put's and Clerk.Get's is linearizable (see // linearizability-faq.txt) -func checkPorcupine( - t *testing.T, opLog *OpLog, annotations []porcupine.Annotation, nsec time.Duration, -) { +func checkPorcupine(t *testing.T, opLog *OpLog, nsec time.Duration) { enabled := os.Getenv("VIS_ENABLE") fpath := os.Getenv("VIS_FILE") res, info := porcupine.CheckOperationsVerbose(models.KvModel, opLog.Read(), nsec) @@ -102,6 +100,7 @@ func checkPorcupine( fmt.Printf("info: failed to open visualization file %s (%v)\n", fpath, err) } else if enabled != "never" { // Don't produce visualization file if VIS_ENABLE is set to "never". 
+ annotations := tester.FinalizeAnnotations("test failed") info.AddAnnotations(annotations) err = porcupine.Visualize(models.KvModel, info, file) if err != nil { @@ -116,7 +115,7 @@ func checkPorcupine( } // The result is either legal or unknown. - if enabled == "always" { + if enabled == "always" && tester.GetAnnotationFinalized() { var file *os.File var err error if fpath == "" { @@ -129,6 +128,7 @@ func checkPorcupine( fmt.Printf("info: failed to open visualization file %s (%v)\n", fpath, err) return } + annotations := tester.FinalizeAnnotations("test passed") info.AddAnnotations(annotations) err = porcupine.Visualize(models.KvModel, info, file) if err != nil { @@ -180,9 +180,8 @@ func (ts *Test) CheckPorcupine() { } func (ts *Test) CheckPorcupineT(nsec time.Duration) { - // ts.RetrieveAnnotations() also clears the accumulated annotations so that - // the vis file containing client operations (generated here) won't be + // tester.RetrieveAnnotations() also clears the accumulated annotations so + // that the vis file containing client operations (generated here) won't be // overridden by that without client operations (generated at cleanup time). - annotations := ts.RetrieveAnnotations() - checkPorcupine(ts.t, ts.oplog, annotations, nsec) + checkPorcupine(ts.t, ts.oplog, nsec) } diff --git a/src/shardkv1/shardctrler/shardctrler.go b/src/shardkv1/shardctrler/shardctrler.go index b0f351c..3d8743e 100644 --- a/src/shardkv1/shardctrler/shardctrler.go +++ b/src/shardkv1/shardctrler/shardctrler.go @@ -8,9 +8,9 @@ import ( "sync/atomic" + "6.5840/kvsrv1" "6.5840/kvsrv1/rpc" "6.5840/kvtest1" - "6.5840/shardkv1/kvsrv1" "6.5840/shardkv1/shardcfg" "6.5840/tester1" ) diff --git a/src/shardkv1/shardkv_test.go b/src/shardkv1/shardkv_test.go index 14d9ede..78f0f58 100644 --- a/src/shardkv1/shardkv_test.go +++ b/src/shardkv1/shardkv_test.go @@ -22,8 +22,8 @@ const ( // kvsrv1 lab. func TestInitQuery5A(t *testing.T) { - // MakeTest starts a key/value server using `kvsrv.StartKVServer`, - // which is defined in shardkv1/kvsrv1. + // MakeTest starts your lab2 key/value server using + // `kvsrv.StartKVServer`. ts := MakeTest(t, "Test (5A): Init and Query ...", true) defer ts.Cleanup() diff --git a/src/shardkv1/test.go b/src/shardkv1/test.go index 14974a8..2020f48 100644 --- a/src/shardkv1/test.go +++ b/src/shardkv1/test.go @@ -10,10 +10,10 @@ import ( "time" "6.5840/kvraft1/rsm" + "6.5840/kvsrv1" "6.5840/kvsrv1/rpc" "6.5840/kvtest1" "6.5840/labrpc" - "6.5840/shardkv1/kvsrv1" "6.5840/shardkv1/shardcfg" "6.5840/shardkv1/shardctrler" "6.5840/shardkv1/shardctrler/param" diff --git a/src/tester1/annotation.go b/src/tester1/annotation.go index b603ad2..dbbb19f 100644 --- a/src/tester1/annotation.go +++ b/src/tester1/annotation.go @@ -20,6 +20,7 @@ type Annotation struct { mu *sync.Mutex annotations []porcupine.Annotation continuous map[string]Continuous + finalized bool } type Continuous struct { @@ -70,8 +71,22 @@ const ( TAG_INFO string = "$ Test Info" ) -func (cfg *Config) RetrieveAnnotations() []porcupine.Annotation{ - annotations := annotation.retrieve() +func FinalizeAnnotations(end string) []porcupine.Annotation { + annotations := annotation.finalize() + + // XXX: Make the last annotation an interval one to work around Porcupine's + // issue. Consider removing this once the issue is fixed. 
+ t := timestamp() + aend := porcupine.Annotation{ + Tag: TAG_INFO, + Start: t, + End: t + 1000, + Description: end, + Details: end, + BackgroundColor: COLOR_INFO, + } + annotations = append(annotations, aend) + return annotations } @@ -173,12 +188,39 @@ func AnnotateCheckerNeutral(desp, details string) { AnnotateCheckerEnd(desp, details, COLOR_NEUTRAL) } +func SetAnnotationFinalized() { + annotation.mu.Lock() + defer annotation.mu.Unlock() + + annotation.finalized = true +} + +func (an *Annotation) isFinalized() bool { + annotation.mu.Lock() + defer annotation.mu.Unlock() + + return annotation.finalized +} + +func GetAnnotationFinalized() bool { + return annotation.isFinalized() +} + // Used before log.Fatalf func AnnotateCheckerFailureBeforeExit(desp, details string) { AnnotateCheckerFailure(desp, details) annotation.cleanup(true, "test failed") } +// The current annotation API for failures is very hacky. We really should have +// just one function that reads the current network/server status. For network, +// we should be able to read whether an endname is enabled. However, if the +// endname is enabled from X to Y, but not Y to X, the annotation would be +// downright confusing. A better design (in the tester framework, not in the +// annotation layer) is to have a single boolean for each pair of servers; once +// we have such state, the annotation can then simply read the booleans to +// determine the network partitions. + // Two functions to annotate partitions: AnnotateConnection and // AnnotateTwoPartitions. The connected field of ServerGrp (in group.go) is // precise if and only if the ServerGrp.Partition is not used. Thus, we use the @@ -236,14 +278,31 @@ func annotateFault() { AnnotateContinuousColor(TAG_PARTITION, text, text, COLOR_FAULT) } +// Currently this API does not work with failed servers, nor with the connected +// fields of ServerGrp (in group.go). It is used specifically for +// ServerGrp.Partition. func AnnotateTwoPartitions(p1 []int, p2 []int) { // A bit hard to check whether the partition actually changes, so just // annotate on every invocation. 
- // TODO - text := fmt.Sprintf("%v %v", p1, p2) + text := fmt.Sprintf("partition = %v %v", p1, p2) AnnotateContinuousColor(TAG_PARTITION, text, text, COLOR_FAULT) } +func AnnotateClearFailure() { + finfo.mu.Lock() + defer finfo.mu.Unlock() + + for id := range(finfo.crashed) { + finfo.crashed[id] = false + } + + for id := range(finfo.connected) { + finfo.connected[id] = true + } + + AnnotateContinuousEnd(TAG_PARTITION) +} + func AnnotateShutdown(servers []int) { finfo.mu.Lock() defer finfo.mu.Unlock() @@ -316,9 +375,12 @@ func timestamp() int64 { return int64(time.Since(time.Unix(0, 0))) } -func (an *Annotation) retrieve() []porcupine.Annotation { +func (an *Annotation) finalize() []porcupine.Annotation { an.mu.Lock() + defer an.mu.Unlock() + x := an.annotations + t := timestamp() for tag, cont := range(an.continuous) { a := porcupine.Annotation{ @@ -331,9 +393,8 @@ func (an *Annotation) retrieve() []porcupine.Annotation { } x = append(x, a) } - an.annotations = make([]porcupine.Annotation, 0) - an.continuous = make(map[string]Continuous) - an.mu.Unlock() + + an.finalized = true return x } @@ -341,6 +402,7 @@ func (an *Annotation) clear() { an.mu.Lock() an.annotations = make([]porcupine.Annotation, 0) an.continuous = make(map[string]Continuous) + an.finalized = false an.mu.Unlock() } @@ -445,21 +507,22 @@ func (an *Annotation) annotateContinuousEnd(tag string) { func (an *Annotation) cleanup(failed bool, end string) { enabled := os.Getenv("VIS_ENABLE") - if enabled == "never" || (!failed && enabled != "always") { + if enabled == "never" || (!failed && enabled != "always") || an.isFinalized() { // Simply clean up the annotations without producing the vis file if - // VIS_ENABLE is set to "never", or if the test passes and VIS_ENABLE is - // not set to "always". + // VIS_ENABLE is set to "never", OR if the test passes AND VIS_ENABLE is + // not set to "always", OR the current test has already been finalized + // (because CheckPorcupine has already produced a vis file). an.clear() return } - annotations := an.retrieve() + annotations := an.finalize() if len(annotations) == 0 { // Skip empty annotations. return } - // XXX: Make the last annotation a interval one to work around Porcupine's + // XXX: Make the last annotation an interval one to work around Porcupine's // issue. Consider removing this once the issue is fixed. t := timestamp() aend := porcupine.Annotation{ @@ -499,6 +562,7 @@ func mkAnnotation() *Annotation { mu: new(sync.Mutex), annotations: make([]porcupine.Annotation, 0), continuous: make(map[string]Continuous), + finalized: false, } return &an diff --git a/src/tester1/group.go b/src/tester1/group.go index 6f0488b..28849fa 100644 --- a/src/tester1/group.go +++ b/src/tester1/group.go @@ -313,7 +313,7 @@ func (sg *ServerGrp) AllowServersExcept(l int) []int { } func (sg *ServerGrp) Partition(p1 []int, p2 []int) { - // log.Printf("partition servers into: %v %v\n", p1, p2) + //log.Printf("partition servers into: %v %v\n", p1, p2) for i := 0; i < len(p1); i++ { sg.disconnect(p1[i], p2) sg.connect(p1[i], p1)
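Usage note (illustrative, not part of the patch): the checker annotations introduced above follow a begin / success-or-failure pattern around each assertion, with the failure annotation emitted just before the test aborts. A minimal sketch of that pattern, assuming a hypothetical checkLogSize helper and using only the tester calls that appear in the diff (AnnotateCheckerBegin, AnnotateCheckerFailure, AnnotateCheckerSuccess):

package kvraft_example // hypothetical package, for illustration only

import (
	"fmt"
	"testing"

	tester "6.5840/tester1"
)

// checkLogSize is a hypothetical helper mirroring the log-trimming checks in
// the patch: announce the check, annotate the failure before Fatalf, and
// annotate success otherwise.
func checkLogSize(t *testing.T, sz, maxraftstate int) {
	tester.AnnotateCheckerBegin(fmt.Sprintf("checking that logs are trimmed to <= 8*%v", maxraftstate))
	if sz > 8*maxraftstate {
		err := fmt.Sprintf("logs were not trimmed (%v > 8*%v)", sz, maxraftstate)
		tester.AnnotateCheckerFailure(err, err)
		t.Fatalf("%s", err)
	}
	tester.AnnotateCheckerSuccess(fmt.Sprintf("log size %v within bound", sz), "OK")
}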