Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests: preserve logs on LibGoalFixture failure #6030

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions agreement/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ func (s *Service) Start() {
//
// This method returns after all resources have been cleaned up.
func (s *Service) Shutdown() {
s.log.Debug("agreement service is stopping")
defer s.log.Debug("agreement service has stopped")

close(s.quit)
s.quitFn()
<-s.done
Expand Down
3 changes: 3 additions & 0 deletions catchup/catchpointService.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@
// Stop stops the catchpoint catchup service - unlike Abort, this is not intended to abort the process but rather to allow
// cleanup of in-memory resources for the purpose of clean shutdown.
func (cs *CatchpointCatchupService) Stop() {
cs.log.Debug("catchpoint service is stopping")
defer cs.log.Debug("catchpoint service has stopped")

Check warning on line 189 in catchup/catchpointService.go

View check run for this annotation

Codecov / codecov/patch

catchup/catchpointService.go#L188-L189

Added lines #L188 - L189 were not covered by tests

// signal the running goroutine that we want to stop
cs.cancelCtxFunc()
// wait for the running goroutine to terminate.
Expand Down
3 changes: 3 additions & 0 deletions catchup/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ func (s *Service) Start() {

// Stop informs the catchup service that it should stop, and waits for it to stop (when periodicSync() exits)
func (s *Service) Stop() {
s.log.Debug("catchup service is stopping")
defer s.log.Debug("catchup service has stopped")

s.cancel()
s.workers.Wait()
if s.initialSyncNotified.CompareAndSwap(0, 1) {
Expand Down
3 changes: 3 additions & 0 deletions data/txHandler.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ func (handler *TxHandler) Start() {

// Stop suspends the processing of incoming messages at the transaction handler
func (handler *TxHandler) Stop() {
logging.Base().Debug("transaction handler is stopping")
defer logging.Base().Debug("transaction handler is stopping")

handler.ctxCancel()
if handler.erl != nil {
handler.erl.Stop()
Expand Down
15 changes: 10 additions & 5 deletions netdeploy/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,13 +415,14 @@

// Stop the network, ensuring primary relay stops first
// No return code - we try to kill them if we can (if we read valid PID file)
func (n Network) Stop(binDir string) {
c := make(chan struct{}, len(n.cfg.RelayDirs)+len(n.nodeDirs))
func (n Network) Stop(binDir string) (err error) {
c := make(chan error, len(n.cfg.RelayDirs)+len(n.nodeDirs))

Check warning on line 419 in netdeploy/network.go

View check run for this annotation

Codecov / codecov/patch

netdeploy/network.go#L418-L419

Added lines #L418 - L419 were not covered by tests
stopNodeContoller := func(nc *nodecontrol.NodeController) {
var stopErr error

Check warning on line 421 in netdeploy/network.go

View check run for this annotation

Codecov / codecov/patch

netdeploy/network.go#L421

Added line #L421 was not covered by tests
defer func() {
c <- struct{}{}
c <- stopErr

Check warning on line 423 in netdeploy/network.go

View check run for this annotation

Codecov / codecov/patch

netdeploy/network.go#L423

Added line #L423 was not covered by tests
}()
nc.FullStop()
stopErr = nc.FullStop()

Check warning on line 425 in netdeploy/network.go

View check run for this annotation

Codecov / codecov/patch

netdeploy/network.go#L425

Added line #L425 was not covered by tests
}
for _, relayDir := range n.cfg.RelayDirs {
relayDataDir := n.getNodeFullPath(relayDir)
Expand All @@ -439,9 +440,13 @@
}
// wait until we finish stopping all the node controllers.
for i := cap(c); i > 0; i-- {
<-c
stopErr := <-c
if stopErr != nil {
err = stopErr

Check warning on line 445 in netdeploy/network.go

View check run for this annotation

Codecov / codecov/patch

netdeploy/network.go#L443-L445

Added lines #L443 - L445 were not covered by tests
}
}
close(c)
return err

Check warning on line 449 in netdeploy/network.go

View check run for this annotation

Codecov / codecov/patch

netdeploy/network.go#L449

Added line #L449 was not covered by tests
}

// NetworkNodeStatus represents the result from checking the status of a particular node instance
Expand Down
3 changes: 3 additions & 0 deletions network/wsNetwork.go
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,9 @@ func (wn *WebsocketNetwork) innerStop() {
// Stop closes network connections and stops threads.
// Stop blocks until all activity on this node is done.
func (wn *WebsocketNetwork) Stop() {
wn.log.Debug("network is stopping")
defer wn.log.Debug("network has stopped")

wn.handler.ClearHandlers([]Tag{})

// if we have a working ticker, just stop it and clear it out. The access to this variable is safe since the Start()/Stop() are synced by the
Expand Down
2 changes: 2 additions & 0 deletions node/impls.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ func (i blockAuthenticatorImpl) Authenticate(block *bookkeeping.Block, cert *agr
}

func (i blockAuthenticatorImpl) Quit() {
logging.Base().Debug("block authenticator is stopping")
defer logging.Base().Debug("block authenticator has stopped")
i.AsyncVoteVerifier.Quit()
}

Expand Down
7 changes: 7 additions & 0 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ func (node *AlgorandFullNode) startMonitoringRoutines() {
// waitMonitoringRoutines waits for all the monitoring routines to exit. Note that
// the node.mu must not be taken, and that the node's context should have been canceled.
func (node *AlgorandFullNode) waitMonitoringRoutines() {
node.log.Debug("waiting on node monitoring routines to exit")
defer node.log.Debug("done waiting on node monitoring routines to exit")
node.monitoringRoutinesWaitGroup.Wait()
}

Expand All @@ -409,6 +411,9 @@ func (node *AlgorandFullNode) ListeningAddress() (string, bool) {

// Stop stops running the node. Once a node is closed, it can never start again.
func (node *AlgorandFullNode) Stop() {
node.log.Debug("algorand node is stopping")
defer node.log.Debug("algorand node has stopped")

node.mu.Lock()
defer func() {
node.mu.Unlock()
Expand All @@ -431,9 +436,11 @@ func (node *AlgorandFullNode) Stop() {
node.ledgerService.Stop()
}
node.catchupBlockAuth.Quit()
node.log.Debug("crypto worker pools are stopping")
node.highPriorityCryptoVerificationPool.Shutdown()
node.lowPriorityCryptoVerificationPool.Shutdown()
node.cryptoPool.Shutdown()
node.log.Debug("crypto worker pools have stopped")
node.cancelCtx()
}

Expand Down
9 changes: 9 additions & 0 deletions nodecontrol/algodControl.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@
return fmt.Sprintf("no running node in directory '%s'", e.algodDataDir)
}

// NodeKilledError thrown when StopAlgod is called but the node was killed by SIGKILL instead of a clean shutdown with SIGTERM
type NodeKilledError struct {
}

func (e *NodeKilledError) Error() string {
return "node was killed"

Check warning on line 57 in nodecontrol/algodControl.go

View check run for this annotation

Codecov / codecov/patch

nodecontrol/algodControl.go#L56-L57

Added lines #L56 - L57 were not covered by tests
}

// MissingDataDirError thrown when StopAlgod is called but requested directory does not exist
type MissingDataDirError struct {
algodDataDir string
Expand Down Expand Up @@ -176,6 +184,7 @@
if killed {
// delete the pid file.
os.Remove(nc.algodPidFile)
return &NodeKilledError{}

Check warning on line 187 in nodecontrol/algodControl.go

View check run for this annotation

Codecov / codecov/patch

nodecontrol/algodControl.go#L187

Added line #L187 was not covered by tests
}
} else {
return &NodeNotRunningError{algodDataDir: nc.algodDataDir}
Expand Down
3 changes: 3 additions & 0 deletions rpcs/blockService.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ func (bs *BlockService) Start() {

// Stop servicing catchup requests over ws
func (bs *BlockService) Stop() {
bs.log.Debug("block service is stopping")
defer bs.log.Debug("block service has stopped")

bs.mu.Lock()
close(bs.stop)
bs.mu.Unlock()
Expand Down
3 changes: 3 additions & 0 deletions rpcs/ledgerService.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ func (ls *LedgerService) Start() {
// Stop servicing catchup requests
func (ls *LedgerService) Stop() {
if ls.enableService {
logging.Base().Debug("ledger service is stopping")
defer logging.Base().Debug("ledger service has stopped")

ls.running.Store(0)
ls.stopping.Wait()
}
Expand Down
3 changes: 3 additions & 0 deletions rpcs/txSyncer.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ func (syncer *TxSyncer) Start(canStart chan struct{}) {

// Stop stops periodic syncing
func (syncer *TxSyncer) Stop() {
syncer.log.Debug("transaction syncer is stopping")
defer syncer.log.Debug("transaction syncer has stopped")

syncer.cancel()
syncer.wg.Wait()
}
Expand Down
3 changes: 3 additions & 0 deletions stateproof/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ func (spw *Worker) initDb(inMemory bool) error {
// Stop stops any goroutines associated with this worker. It is the caller responsibility to remove the register
// network handlers
func (spw *Worker) Stop() {
spw.log.Debug("stateproof worker is stopping")
defer spw.log.Debug("stateproof worker has stopped")

spw.shutdown()
spw.wg.Wait()

Expand Down
17 changes: 17 additions & 0 deletions test/e2e-go/features/catchup/basicCatchup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ func runCatchupOverGossip(t fixtures.TestingTB,
a.NoError(err)
a.Empty(cfg.NetworkProtocolVersion)
cfg.NetworkProtocolVersion = ledgerNodeDowngradeTo
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
} else {
// TODO: remove when TestCatchupOverGossip is fixed
dir, err := fixture.GetNodeDir("Node")
a.NoError(err)
cfg, err := config.LoadConfigFromDisk(dir)
a.NoError(err)
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
}

Expand All @@ -138,6 +147,14 @@ func runCatchupOverGossip(t fixtures.TestingTB,
a.NoError(err)
a.Empty(cfg.NetworkProtocolVersion)
cfg.NetworkProtocolVersion = fetcherNodeDowngradeTo
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
} else {
// TODO: remove when TestCatchupOverGossip is fixed
dir := fixture.PrimaryDataDir()
cfg, err := config.LoadConfigFromDisk(dir)
a.NoError(err)
cfg.BaseLoggerDebugLevel = 5 // debug logging while debugging this test
cfg.SaveToDisk(dir)
}

Expand Down
37 changes: 24 additions & 13 deletions test/framework/fixtures/libgoalFixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err e
}

debugLog := func() {
fmt.Fprintf(os.Stderr, "Node at %s has terminated with an error: %v. Dumping logs...", nc.GetDataDir(), err)
fmt.Fprintf(os.Stderr, "Node at %s has terminated with an error: %v. Dumping logs...\n", nc.GetDataDir(), err)
f.dumpLogs(filepath.Join(nc.GetDataDir(), "node.log"))
}

Expand All @@ -158,7 +158,7 @@ func (f *LibGoalFixture) nodeExitWithError(nc *nodecontrol.NodeController, err e
ws := exitError.Sys().(syscall.WaitStatus)
exitCode := ws.ExitStatus()

fmt.Fprintf(os.Stderr, "Node at %s has terminated with error code %d (%v)", nc.GetDataDir(), exitCode, *exitError)
fmt.Fprintf(os.Stderr, "Node at %s has terminated with error code %d (%v)\n", nc.GetDataDir(), exitCode, *exitError)
debugLog()
require.NoError(f.t, err)
}
Expand Down Expand Up @@ -345,22 +345,33 @@ func (f *LibGoalFixture) Shutdown() {
func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
f.NC.StopKMD()
if preserveData {
f.network.Stop(f.binDir)
err := f.network.Stop(f.binDir)
if err != nil {
f.t.Logf("Fixture %s shutdown caught a network stop error: %v", f.Name, err)
}
for _, relayDir := range f.RelayDataDirs() {
f.dumpLogs(filepath.Join(relayDir, "node.log"))
}
for _, nodeDir := range f.NodeDataDirs() {
f.dumpLogs(filepath.Join(nodeDir, "node.log"))
}
} else {
f.network.Delete(f.binDir)

// Remove the test dir, if it was created by us as a temporary
// directory and it is empty. If there's anything still in the
// test dir, os.Remove()'s rmdir will fail and have no effect;
// we ignore this error.
if f.testDirTmp {
os.Remove(f.testDir)
err := f.network.Stop(f.binDir)
if err == nil {
// no error, proceed with cleanup
delErr := f.network.Delete(f.binDir)
if delErr != nil {
f.t.Logf("Fixture %s shutdown caught a network delete error: %v", f.Name, delErr)
}
// Remove the test dir, if it was created by us as a temporary
// directory and it is empty. If there's anything still in the
// test dir, os.Remove()'s rmdir will fail and have no effect;
// we ignore this error.
if f.testDirTmp {
os.Remove(f.testDir)
}
} else {
f.t.Logf("Fixture %s shutdown caught a network stop error: %v", f.Name, err)
}
}
}
Expand All @@ -369,14 +380,14 @@ func (f *LibGoalFixture) ShutdownImpl(preserveData bool) {
func (f *LibGoalFixture) dumpLogs(filePath string) {
file, err := os.Open(filePath)
if err != nil {
fmt.Fprintf(os.Stderr, "could not open %s", filePath)
fmt.Fprintf(os.Stderr, "could not open %s\n", filePath)
return
}
defer file.Close()

fmt.Fprintf(os.Stderr, "=================================\n")
parts := strings.Split(filePath, "/")
fmt.Fprintf(os.Stderr, "%s/%s:", parts[len(parts)-2], parts[len(parts)-1]) // Primary/node.log
fmt.Fprintf(os.Stderr, "%s/%s:\n", parts[len(parts)-2], parts[len(parts)-1]) // Primary/node.log
scanner := bufio.NewScanner(file)
for scanner.Scan() {
fmt.Fprint(os.Stderr, scanner.Text())
Expand Down
Loading