diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index b4fe0e5549c7c..006274757e66f 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -9,6 +9,7 @@ package runtime
 
 import (
 	"internal/abi"
+	"internal/goarch"
 	"internal/profilerecord"
 	"internal/runtime/atomic"
 	"runtime/internal/sys"
@@ -542,16 +543,14 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	gp := getg()
 	mp := acquirem() // we must not be preempted while accessing profstack
 
-	nstk := 1
+	var nstk int
 	if tracefpunwindoff() || gp.m.hasCgoOnStack() {
-		mp.profStack[0] = logicalStackSentinel
 		if gp.m.curg == nil || gp.m.curg == gp {
-			nstk = callers(skip, mp.profStack[1:])
+			nstk = callers(skip, mp.profStack)
 		} else {
-			nstk = gcallers(gp.m.curg, skip, mp.profStack[1:])
+			nstk = gcallers(gp.m.curg, skip, mp.profStack)
 		}
 	} else {
-		mp.profStack[0] = uintptr(skip)
 		if gp.m.curg == nil || gp.m.curg == gp {
 			if skip > 0 {
 				// We skip one fewer frame than the provided value for frame
@@ -559,12 +558,12 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 				// frame, whereas the saved frame pointer will give us the
 				// caller's return address first (so, not including
 				// saveblockevent)
-				mp.profStack[0] -= 1
+				skip -= 1
 			}
-			nstk += fpTracebackPCs(unsafe.Pointer(getfp()), mp.profStack[1:])
+			nstk = fpTracebackPartialExpand(skip, unsafe.Pointer(getfp()), mp.profStack)
 		} else {
-			mp.profStack[1] = gp.m.curg.sched.pc
-			nstk += 1 + fpTracebackPCs(unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[2:])
+			mp.profStack[0] = gp.m.curg.sched.pc
+			nstk = 1 + fpTracebackPartialExpand(skip, unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[1:])
 		}
 	}
 
@@ -572,6 +571,52 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	releasem(mp)
 }
 
+// fpTracebackPartialExpand records a call stack obtained starting from fp.
+// This function will skip the given number of frames, properly accounting for
+// inlining, and save remaining frames as "physical" return addresses. The
+// consumer should later use CallersFrames or similar to expand inline frames.
+func fpTracebackPartialExpand(skip int, fp unsafe.Pointer, pcBuf []uintptr) int {
+	var n int
+	lastFuncID := abi.FuncIDNormal
+	skipOrAdd := func(retPC uintptr) bool {
+		if skip > 0 {
+			skip--
+		} else if n < len(pcBuf) {
+			pcBuf[n] = retPC
+			n++
+		}
+		return n < len(pcBuf)
+	}
+	for n < len(pcBuf) && fp != nil {
+		// return addr sits one word above the frame pointer
+		pc := *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
+
+		if skip > 0 {
+			callPC := pc - 1
+			fi := findfunc(callPC)
+			u, uf := newInlineUnwinder(fi, callPC)
+			for ; uf.valid(); uf = u.next(uf) {
+				sf := u.srcFunc(uf)
+				if sf.funcID == abi.FuncIDWrapper && elideWrapperCalling(lastFuncID) {
+					// ignore wrappers
+				} else if more := skipOrAdd(uf.pc + 1); !more {
+					return n
+				}
+				lastFuncID = sf.funcID
+			}
+		} else {
+			// We've skipped the desired number of frames, so no need
+			// to perform further inline expansion now.
+			pcBuf[n] = pc
+			n++
+		}
+
+		// follow the frame pointer to the next one
+		fp = unsafe.Pointer(*(*uintptr)(fp))
+	}
+	return n
+}
+
 // lockTimer assists with profiling contention on runtime-internal locks.
 //
 // There are several steps between the time that an M experiences contention and
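Note on the PC arithmetic above: profile stacks store return PCs, so fpTracebackPartialExpand symbolizes with callPC := pc - 1 and hands out return addresses again via uf.pc + 1. A small standalone sketch of the same convention using only the public runtime API (the capture helper and buffer size here are illustrative, not part of this change):

	package main

	import (
		"fmt"
		"runtime"
	)

	// capture returns the return PCs of its callers, the same kind of
	// "physical" return addresses a frame-pointer traceback produces.
	func capture() []uintptr {
		pcs := make([]uintptr, 16)
		n := runtime.Callers(2, pcs) // skip runtime.Callers and capture itself
		return pcs[:n]
	}

	func main() {
		for _, pc := range capture() {
			// pc is a return PC; pc-1 lands inside the call instruction,
			// which is what symbolization wants.
			f := runtime.FuncForPC(pc - 1)
			if f == nil {
				continue
			}
			file, line := f.FileLine(pc - 1)
			fmt.Printf("%s %s:%d\n", f.Name(), file, line)
		}
	}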
@@ -1075,10 +1120,34 @@ type BlockProfileRecord struct {
 // the [testing] package's -test.blockprofile flag instead
 // of calling BlockProfile directly.
 func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
-	return blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
-		copyBlockProfileRecord(&p[0], r)
-		p = p[1:]
+	var m int
+	n, ok = blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+		copyBlockProfileRecord(&p[m], r)
+		m++
 	})
+	if ok {
+		expandFrames(p[:n])
+	}
+	return
+}
+
+func expandFrames(p []BlockProfileRecord) {
+	expandedStack := makeProfStack()
+	for i := range p {
+		cf := CallersFrames(p[i].Stack())
+		j := 0
+		for ; j < len(expandedStack); j++ {
+			f, more := cf.Next()
+			// f.PC is a "call PC", but later consumers will expect
+			// "return PCs"
+			expandedStack[j] = f.PC + 1
+			if !more {
+				break
+			}
+		}
+		k := copy(p[i].Stack0[:], expandedStack[:j])
+		clear(p[i].Stack0[k:])
+	}
 }
 
 // blockProfileInternal returns the number of records n in the profile. If there
@@ -1111,6 +1180,9 @@ func blockProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord
 	return
 }
 
+// copyBlockProfileRecord copies the sample values and call stack from src to dst.
+// The call stack is copied as-is. The caller is responsible for handling inline
+// expansion, needed when the call stack was collected with frame pointer unwinding.
 func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProfileRecord) {
 	dst.Count = src.Count
 	dst.Cycles = src.Cycles
@@ -1123,7 +1195,11 @@ func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProf
 	if asanenabled {
 		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
 	}
-	i := fpunwindExpand(dst.Stack0[:], src.Stack)
+	// We just copy the stack here without inline expansion
+	// (needed if frame pointer unwinding is used)
+	// since this function is called under the profile lock,
+	// and doing something that might allocate can violate lock ordering.
+	i := copy(dst.Stack0[:], src.Stack)
 	clear(dst.Stack0[i:])
 }
 
@@ -1142,10 +1218,15 @@ func pprof_blockProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok
 // Most clients should use the [runtime/pprof] package
 // instead of calling MutexProfile directly.
 func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
-	return mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
-		copyBlockProfileRecord(&p[0], r)
-		p = p[1:]
+	var m int
+	n, ok = mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+		copyBlockProfileRecord(&p[m], r)
+		m++
 	})
+	if ok {
+		expandFrames(p[:n])
+	}
+	return
 }
 
 // mutexProfileInternal returns the number of records n in the profile. If there
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index d3af5bba914ec..4b7a9f63c6500 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -404,6 +404,25 @@ type countProfile interface {
 	Label(i int) *labelMap
 }
 
+// expandInlinedFrames copies the call stack from pcs into dst, expanding any
+// PCs corresponding to inlined calls into the corresponding PCs for the inlined
+// functions. Returns the number of frames copied to dst.
+func expandInlinedFrames(dst, pcs []uintptr) int {
+	cf := runtime.CallersFrames(pcs)
+	var n int
+	for n < len(dst) {
+		f, more := cf.Next()
+		// f.PC is a "call PC", but later consumers will expect
+		// "return PCs"
+		dst[n] = f.PC + 1
+		n++
+		if !more {
+			break
+		}
+	}
+	return n
+}
+
 // printCountCycleProfile outputs block profile records (for block or mutex profiles)
 // as the pprof-proto format output. Translations from cycle count to time duration
 // are done because The proto expects count and time (nanoseconds) instead of count
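expandInlinedFrames above stays inside runtime/pprof, but the same read-time expansion can be written against the public API. A minimal sketch (the expand helper name and slice sizes are assumptions for illustration):

	package main

	import (
		"fmt"
		"runtime"
	)

	// expand rewrites the return PCs in pcs into dst with one return PC per
	// logical frame, expanding inlined calls at read time.
	func expand(dst, pcs []uintptr) int {
		frames := runtime.CallersFrames(pcs)
		n := 0
		for n < len(dst) {
			f, more := frames.Next()
			// Frame.PC is a call PC; shift it back to a return PC for
			// consumers that expect return addresses.
			dst[n] = f.PC + 1
			n++
			if !more {
				break
			}
		}
		return n
	}

	func main() {
		pcs := make([]uintptr, 32)
		n := runtime.Callers(1, pcs)
		dst := make([]uintptr, 64)
		m := expand(dst, pcs[:n])
		fmt.Printf("%d input PCs expanded to %d frames\n", n, m)
	}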
@@ -426,7 +445,7 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, records []
 		values[1] = int64(float64(r.Cycles) / cpuGHz)
 		// For count profiles, all stack addresses are
 		// return PCs, which is what appendLocsForStack expects.
-		n := pprof_fpunwindExpand(expandedStack[:], r.Stack)
+		n := expandInlinedFrames(expandedStack, r.Stack)
 		locs = b.appendLocsForStack(locs[:0], expandedStack[:n])
 		b.pbSample(values, locs, nil)
 	}
@@ -935,7 +954,7 @@ func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile fu
 		for i := range p {
 			r := &p[i]
 			fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
-			n := pprof_fpunwindExpand(expandedStack, r.Stack)
+			n := expandInlinedFrames(expandedStack, r.Stack)
 			stack := expandedStack[:n]
 			for _, pc := range stack {
 				fmt.Fprintf(w, " %#x", pc)
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 09abbb31ae6cc..30ef50b1c0fa7 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -21,6 +21,7 @@ import (
 	"regexp"
 	"runtime"
 	"runtime/debug"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
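The test helper added in the next hunk sizes its record slice with the usual grow-and-retry loop around runtime.MutexProfile and runtime.BlockProfile, which report the required record count and ok=false when the slice is too small. A standalone sketch of that pattern (collectMutexProfile and the headroom constant are illustrative, not part of this change):

	package main

	import (
		"fmt"
		"runtime"
	)

	// collectMutexProfile grows its buffer until MutexProfile reports ok,
	// since records may be added between the sizing call and the copy.
	func collectMutexProfile() []runtime.BlockProfileRecord {
		n, ok := runtime.MutexProfile(nil)
		var p []runtime.BlockProfileRecord
		for !ok {
			p = make([]runtime.BlockProfileRecord, n+10) // headroom for new records
			n, ok = runtime.MutexProfile(p)
		}
		return p[:n]
	}

	func main() {
		prev := runtime.SetMutexProfileFraction(1)
		defer runtime.SetMutexProfileFraction(prev)
		fmt.Println("records:", len(collectMutexProfile()))
	}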
@@ -2578,3 +2579,166 @@ func produceProfileEvents(t *testing.T, depth int) {
 	runtime.GC()
 	goroutineDeep(t, depth-4) // -4 for produceProfileEvents, **, chanrecv1, chanrev, gopark
 }
+
+func getProfileStacks(collect func([]runtime.BlockProfileRecord) (int, bool), fileLine bool) []string {
+	var n int
+	var ok bool
+	var p []runtime.BlockProfileRecord
+	for {
+		p = make([]runtime.BlockProfileRecord, n)
+		n, ok = collect(p)
+		if ok {
+			p = p[:n]
+			break
+		}
+	}
+	var stacks []string
+	for _, r := range p {
+		var stack strings.Builder
+		for i, pc := range r.Stack() {
+			if i > 0 {
+				stack.WriteByte('\n')
+			}
+			// Use FuncForPC instead of CallersFrames,
+			// because we want to see the info for exactly
+			// the PCs returned by the mutex profile to
+			// ensure inlined calls have already been properly
+			// expanded.
+			f := runtime.FuncForPC(pc - 1)
+			stack.WriteString(f.Name())
+			if fileLine {
+				stack.WriteByte(' ')
+				file, line := f.FileLine(pc - 1)
+				stack.WriteString(file)
+				stack.WriteByte(':')
+				stack.WriteString(strconv.Itoa(line))
+			}
+		}
+		stacks = append(stacks, stack.String())
+	}
+	return stacks
+}
+
+func TestMutexBlockFullAggregation(t *testing.T) {
+	// This regression test is adapted from
+	// https://github.com/grafana/pyroscope-go/issues/103,
+	// authored by Tolya Korniltsev
+
+	var m sync.Mutex
+
+	prev := runtime.SetMutexProfileFraction(-1)
+	defer runtime.SetMutexProfileFraction(prev)
+
+	const fraction = 1
+	const iters = 100
+	const workers = 2
+
+	runtime.SetMutexProfileFraction(fraction)
+	runtime.SetBlockProfileRate(1)
+	defer runtime.SetBlockProfileRate(0)
+
+	wg := sync.WaitGroup{}
+	wg.Add(workers)
+	for j := 0; j < workers; j++ {
+		go func() {
+			for i := 0; i < iters; i++ {
+				m.Lock()
+				// Wait at least 1 millisecond to pass the
+				// starvation threshold for the mutex
+				time.Sleep(time.Millisecond)
+				m.Unlock()
+			}
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+
+	assertNoDuplicates := func(name string, collect func([]runtime.BlockProfileRecord) (int, bool)) {
+		stacks := getProfileStacks(collect, true)
+		seen := make(map[string]struct{})
+		for _, s := range stacks {
+			if _, ok := seen[s]; ok {
+				t.Errorf("saw duplicate entry in %s profile with stack:\n%s", name, s)
+			}
+			seen[s] = struct{}{}
+		}
+		if len(seen) == 0 {
+			t.Errorf("did not see any samples in %s profile for this test", name)
+		}
+	}
+	t.Run("mutex", func(t *testing.T) {
+		assertNoDuplicates("mutex", runtime.MutexProfile)
+	})
+	t.Run("block", func(t *testing.T) {
+		assertNoDuplicates("block", runtime.BlockProfile)
+	})
+}
+
+func inlineA(mu *sync.Mutex, wg *sync.WaitGroup) { inlineB(mu, wg) }
+func inlineB(mu *sync.Mutex, wg *sync.WaitGroup) { inlineC(mu, wg) }
+func inlineC(mu *sync.Mutex, wg *sync.WaitGroup) {
+	defer wg.Done()
+	mu.Lock()
+	mu.Unlock()
+}
+
+func inlineD(mu *sync.Mutex, wg *sync.WaitGroup) { inlineE(mu, wg) }
+func inlineE(mu *sync.Mutex, wg *sync.WaitGroup) { inlineF(mu, wg) }
+func inlineF(mu *sync.Mutex, wg *sync.WaitGroup) {
+	defer wg.Done()
+	mu.Unlock()
+}
+
+func TestBlockMutexProfileInlineExpansion(t *testing.T) {
+	runtime.SetBlockProfileRate(1)
+	defer runtime.SetBlockProfileRate(0)
+	prev := runtime.SetMutexProfileFraction(1)
+	defer runtime.SetMutexProfileFraction(prev)
+
+	var mu sync.Mutex
+	var wg sync.WaitGroup
+	wg.Add(2)
+	mu.Lock()
+	go inlineA(&mu, &wg)
+	awaitBlockedGoroutine(t, "sync.Mutex.Lock", "inlineC", 1)
+	// inlineD will unblock inlineA
+	go inlineD(&mu, &wg)
+	wg.Wait()
+
+	tcs := []struct {
+		Name     string
+		Collect  func([]runtime.BlockProfileRecord) (int, bool)
+		SubStack string
+	}{
+		{
+			Name:    "mutex",
+			Collect: runtime.MutexProfile,
+			SubStack: `sync.(*Mutex).Unlock
+runtime/pprof.inlineF
+runtime/pprof.inlineE
+runtime/pprof.inlineD`,
+		},
+		{
+			Name:    "block",
+			Collect: runtime.BlockProfile,
+			SubStack: `sync.(*Mutex).Lock
+runtime/pprof.inlineC
+runtime/pprof.inlineB
+runtime/pprof.inlineA`,
+		},
+	}
+
+	for _, tc := range tcs {
+		t.Run(tc.Name, func(t *testing.T) {
+			stacks := getProfileStacks(tc.Collect, false)
+			for _, s := range stacks {
+				if strings.Contains(s, tc.SubStack) {
+					return
+				}
+			}
+			t.Error("did not see expected stack")
+			t.Logf("wanted:\n%s", tc.SubStack)
+			t.Logf("got: %s", stacks)
+		})
+	}
+}
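For completeness, an end-to-end sketch (not part of this change): generate a little mutex contention and print the debug=1 mutex profile, whose stacks flow through writeProfileInternal and expandInlinedFrames above, so inlined callers appear as separate frames:

	package main

	import (
		"os"
		"runtime"
		"runtime/pprof"
		"sync"
		"time"
	)

	func main() {
		prev := runtime.SetMutexProfileFraction(1)
		defer runtime.SetMutexProfileFraction(prev)

		var mu sync.Mutex
		var wg sync.WaitGroup
		mu.Lock()
		wg.Add(1)
		go func() {
			defer wg.Done()
			mu.Lock() // blocks until main unlocks, generating contention
			mu.Unlock()
		}()
		time.Sleep(10 * time.Millisecond) // let the goroutine block
		mu.Unlock()
		wg.Wait()

		// debug=1 prints symbolized stacks, one entry per logical frame.
		pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
	}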