diff --git a/src/runtime/runtime-gdb_unix_test.go b/src/runtime/runtime-gdb_unix_test.go index 5413306f772da3..23eb3e2362f184 100644 --- a/src/runtime/runtime-gdb_unix_test.go +++ b/src/runtime/runtime-gdb_unix_test.go @@ -20,6 +20,43 @@ import ( "testing" ) +func canGenerateCore(t *testing.T) bool { + // Ensure there is enough RLIMIT_CORE available to generate a full core. + var lim syscall.Rlimit + err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + t.Fatalf("error getting rlimit: %v", err) + } + // Minimum RLIMIT_CORE max to allow. This is a conservative estimate. + // Most systems allow infinity. + const minRlimitCore = 100 << 20 // 100 MB + if lim.Max < minRlimitCore { + t.Skipf("RLIMIT_CORE max too low: %#+v", lim) + } + + // Make sure core pattern will send core to the current directory. + b, err := os.ReadFile("/proc/sys/kernel/core_pattern") + if err != nil { + t.Fatalf("error reading core_pattern: %v", err) + } + if string(b) != "core\n" { + t.Skipf("Unexpected core pattern %q", string(b)) + } + + coreUsesPID := false + b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid") + if err == nil { + switch string(bytes.TrimSpace(b)) { + case "0": + case "1": + coreUsesPID = true + default: + t.Skipf("unexpected core_uses_pid value %q", string(b)) + } + } + return coreUsesPID +} + const coreSignalSource = ` package main @@ -81,45 +118,12 @@ func TestGdbCoreSignalBacktrace(t *testing.T) { t.Parallel() checkGdbVersion(t) - // Ensure there is enough RLIMIT_CORE available to generate a full core. - var lim syscall.Rlimit - err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) - if err != nil { - t.Fatalf("error getting rlimit: %v", err) - } - // Minimum RLIMIT_CORE max to allow. This is a conservative estimate. - // Most systems allow infinity. - const minRlimitCore = 100 << 20 // 100 MB - if lim.Max < minRlimitCore { - t.Skipf("RLIMIT_CORE max too low: %#+v", lim) - } - - // Make sure core pattern will send core to the current directory. - b, err := os.ReadFile("/proc/sys/kernel/core_pattern") - if err != nil { - t.Fatalf("error reading core_pattern: %v", err) - } - if string(b) != "core\n" { - t.Skipf("Unexpected core pattern %q", string(b)) - } - - coreUsesPID := false - b, err = os.ReadFile("/proc/sys/kernel/core_uses_pid") - if err == nil { - switch string(bytes.TrimSpace(b)) { - case "0": - case "1": - coreUsesPID = true - default: - t.Skipf("unexpected core_uses_pid value %q", string(b)) - } - } - - dir := t.TempDir() + coreUsesPID := canGenerateCore(t) // Build the source code. + dir := t.TempDir() src := filepath.Join(dir, "main.go") - err = os.WriteFile(src, []byte(coreSignalSource), 0644) + err := os.WriteFile(src, []byte(coreSignalSource), 0644) if err != nil { t.Fatalf("failed to create file: %v", err) } @@ -230,3 +234,143 @@ func TestGdbCoreSignalBacktrace(t *testing.T) { t.Fatalf("could not find runtime symbol in backtrace after signal handler:\n%s", rest) } } + +const coreCrashThreadSource = ` +package main + +/* +#cgo CFLAGS: -g -O0 +#include +#include +void trigger_crash() +{ + int* ptr = NULL; + *ptr = 1024; +} +*/ +import "C" +import ( + "flag" + "fmt" + "os" + "runtime/debug" + "syscall" +) + +func enableCore() { + debug.SetTraceback("crash") + + var lim syscall.Rlimit + err := syscall.Getrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + panic(fmt.Sprintf("error getting rlimit: %v", err)) + } + lim.Cur = lim.Max + fmt.Fprintf(os.Stderr, "Setting RLIMIT_CORE = %+#v\n", lim) + err = syscall.Setrlimit(syscall.RLIMIT_CORE, &lim) + if err != nil { + panic(fmt.Sprintf("error setting rlimit: %v", err)) + } +} + +func main() { + flag.Parse() + + enableCore() + + C.trigger_crash() +} +` + +// TestGdbCoreCrashThreadBacktrace tests that runtime could let the fault thread to crash process +// and make fault thread as number one thread while gdb in a core file +func TestGdbCoreCrashThreadBacktrace(t *testing.T) { + if runtime.GOOS != "linux" { + // N.B. This test isn't fundamentally Linux-only, but it needs + // to know how to enable/find core files on each OS. + t.Skip("Test only supported on Linux") + } + if runtime.GOARCH != "386" && runtime.GOARCH != "amd64" { + // TODO(go.dev/issue/25218): Other architectures use sigreturn + // via VDSO, which we somehow don't handle correctly. + t.Skip("Backtrace through signal handler only works on 386 and amd64") + } + + checkGdbEnvironment(t) + t.Parallel() + checkGdbVersion(t) + + coreUsesPID := canGenerateCore(t) + + // Build the source code. + dir := t.TempDir() + src := filepath.Join(dir, "main.go") + err := os.WriteFile(src, []byte(coreCrashThreadSource), 0644) + if err != nil { + t.Fatalf("failed to create file: %v", err) + } + cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go") + cmd.Dir = dir + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("building source %v\n%s", err, out) + } + + // Start the test binary. + cmd = testenv.Command(t, "./a.exe") + cmd.Dir = dir + var output bytes.Buffer + cmd.Stdout = &output // for test logging + cmd.Stderr = &output + + if err := cmd.Start(); err != nil { + t.Fatalf("error starting test binary: %v", err) + } + + pid := cmd.Process.Pid + + err = cmd.Wait() + t.Logf("child output:\n%s", output.String()) + if err == nil { + t.Fatalf("Wait succeeded, want SIGABRT") + } + ee, ok := err.(*exec.ExitError) + if !ok { + t.Fatalf("Wait err got %T %v, want exec.ExitError", ee, ee) + } + ws, ok := ee.Sys().(syscall.WaitStatus) + if !ok { + t.Fatalf("Sys got %T %v, want syscall.WaitStatus", ee.Sys(), ee.Sys()) + } + if ws.Signal() != syscall.SIGABRT { + t.Fatalf("Signal got %d want SIGABRT", ws.Signal()) + } + if !ws.CoreDump() { + t.Fatalf("CoreDump got %v want true", ws.CoreDump()) + } + + coreFile := "core" + if coreUsesPID { + coreFile += fmt.Sprintf(".%d", pid) + } + + // Execute gdb commands. + args := []string{"-nx", "-batch", + "-iex", "add-auto-load-safe-path " + filepath.Join(testenv.GOROOT(t), "src", "runtime"), + "-ex", "backtrace", + filepath.Join(dir, "a.exe"), + filepath.Join(dir, coreFile), + } + cmd = testenv.Command(t, "gdb", args...) + + got, err := cmd.CombinedOutput() + t.Logf("gdb output:\n%s", got) + if err != nil { + t.Fatalf("gdb exited with error: %v", err) + } + + re := regexp.MustCompile(`#.* trigger_crash`) + if found := re.Find(got) != nil; !found { + t.Fatalf("could not find trigger_crash in backtrace") + } +} diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go index cd9fd5d796571b..84391d58ed7f22 100644 --- a/src/runtime/signal_unix.go +++ b/src/runtime/signal_unix.go @@ -597,7 +597,7 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool { // crashing is the number of m's we have waited for when implementing // GOTRACEBACK=crash when a signal is received. -var crashing int32 +var crashing atomic.Int32 // testSigtrap and testSigusr1 are used by the runtime tests. If // non-nil, it is called on SIGTRAP/SIGUSR1. If it returns true, the @@ -730,7 +730,7 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { mp.throwing = throwTypeRuntime mp.caughtsig.set(gp) - if crashing == 0 { + if crashing.Load() == 0 { startpanic_m() } @@ -740,11 +740,11 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { if level > 0 { goroutineheader(gp) tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp) - if crashing > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning { + if crashing.Load() > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning { // tracebackothers on original m skipped this one; trace it now. goroutineheader(mp.curg) traceback(^uintptr(0), ^uintptr(0), 0, mp.curg) - } else if crashing == 0 { + } else if crashing.Load() == 0 { tracebackothers(gp) print("\n") } @@ -752,20 +752,35 @@ func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) { } if docrash { - crashing++ - if crashing < mcount()-int32(extraMLength.Load()) { + isCrashThread := false + if crashing.CompareAndSwap(0, 1) { + isCrashThread = true + } else { + crashing.Add(1) + } + if crashing.Load() < mcount()-int32(extraMLength.Load()) { // There are other m's that need to dump their stacks. // Relay SIGQUIT to the next m by sending it to the current process. // All m's that have already received SIGQUIT have signal masks blocking // receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet. - // When the last m receives the SIGQUIT, it will fall through to the call to - // crash below. Just in case the relaying gets botched, each m involved in + // The first m will wait until all ms received the SIGQUIT, then crash/exit. + // Just in case the relaying gets botched, each m involved in // the relay sleeps for 5 seconds and then does the crash/exit itself. - // In expected operation, the last m has received the SIGQUIT and run - // crash/exit and the process is gone, all long before any of the - // 5-second sleeps have finished. + // The faulting m is crashing first so it is the faulting thread in the core dump (see issue #63277): + // in expected operation, the first m will wait until the last m has received the SIGQUIT, + // and then run crash/exit and the process is gone. + // However, if it spends more than 5 seconds to send SIGQUIT to all ms, + // any of ms may crash/exit the process after waiting for 5 seconds. print("\n-----\n\n") raiseproc(_SIGQUIT) + } + if isCrashThread { + i := 0 + for (crashing.Load() < mcount()-int32(extraMLength.Load())) && i < 10 { + i++ + usleep(500 * 1000) + } + } else { usleep(5 * 1000 * 1000) } printDebugLog()