diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index b4fe0e5549c7c..e5620aec11dfd 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -9,6 +9,7 @@ package runtime
 
 import (
 	"internal/abi"
+	"internal/goarch"
 	"internal/profilerecord"
 	"internal/runtime/atomic"
 	"runtime/internal/sys"
@@ -542,16 +543,14 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 
 	gp := getg()
 	mp := acquirem() // we must not be preempted while accessing profstack
-	nstk := 1
+	var nstk int
 	if tracefpunwindoff() || gp.m.hasCgoOnStack() {
-		mp.profStack[0] = logicalStackSentinel
 		if gp.m.curg == nil || gp.m.curg == gp {
-			nstk = callers(skip, mp.profStack[1:])
+			nstk = callers(skip, mp.profStack)
 		} else {
-			nstk = gcallers(gp.m.curg, skip, mp.profStack[1:])
+			nstk = gcallers(gp.m.curg, skip, mp.profStack)
 		}
 	} else {
-		mp.profStack[0] = uintptr(skip)
 		if gp.m.curg == nil || gp.m.curg == gp {
 			if skip > 0 {
 				// We skip one fewer frame than the provided value for frame
@@ -559,12 +558,12 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 				// frame, whereas the saved frame pointer will give us the
 				// caller's return address first (so, not including
 				// saveblockevent)
-				mp.profStack[0] -= 1
+				skip -= 1
 			}
-			nstk += fpTracebackPCs(unsafe.Pointer(getfp()), mp.profStack[1:])
+			nstk = fpTracebackPartialExpand(skip, unsafe.Pointer(getfp()), mp.profStack)
 		} else {
-			mp.profStack[1] = gp.m.curg.sched.pc
-			nstk += 1 + fpTracebackPCs(unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[2:])
+			mp.profStack[0] = gp.m.curg.sched.pc
+			nstk = 1 + fpTracebackPartialExpand(skip, unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[1:])
 		}
 	}
 
@@ -572,6 +571,52 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 	releasem(mp)
 }
 
+// fpTracebackPartialExpand records a call stack obtained starting from fp.
+// This function will skip the given number of frames, properly accounting for
+// inlining, and save remaining frames as "physical" return addresses. The
+// consumer should later use CallersFrames or similar to expand inline frames.
+func fpTracebackPartialExpand(skip int, fp unsafe.Pointer, pcBuf []uintptr) int {
+	var n int
+	lastFuncID := abi.FuncIDNormal
+	skipOrAdd := func(retPC uintptr) bool {
+		if skip > 0 {
+			skip--
+		} else if n < len(pcBuf) {
+			pcBuf[n] = retPC
+			n++
+		}
+		return n < len(pcBuf)
+	}
+	for n < len(pcBuf) && fp != nil {
+		// return addr sits one word above the frame pointer
+		pc := *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
+
+		if skip > 0 {
+			callPC := pc - 1
+			fi := findfunc(callPC)
+			u, uf := newInlineUnwinder(fi, callPC)
+			for ; uf.valid(); uf = u.next(uf) {
+				sf := u.srcFunc(uf)
+				if sf.funcID == abi.FuncIDWrapper && elideWrapperCalling(lastFuncID) {
+					// ignore wrappers
+				} else if more := skipOrAdd(uf.pc + 1); !more {
+					return n
+				}
+				lastFuncID = sf.funcID
+			}
+		} else {
+			// We've skipped the desired number of frames, so no need
+			// to perform further inline expansion now.
+			pcBuf[n] = pc
+			n++
+		}
+
+		// follow the frame pointer to the next one
+		fp = unsafe.Pointer(*(*uintptr)(fp))
+	}
+	return n
+}
+
 // lockTimer assists with profiling contention on runtime-internal locks.
 //
 // There are several steps between the time that an M experiences contention and
@@ -1075,10 +1120,34 @@ type BlockProfileRecord struct {
 // the [testing] package's -test.blockprofile flag instead
 // of calling BlockProfile directly.
 func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
-	return blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+	n, ok = blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
 		copyBlockProfileRecord(&p[0], r)
 		p = p[1:]
 	})
+	if !ok {
+		return
+	}
+	expandFrames(p[:n])
+	return
+}
+
+func expandFrames(p []BlockProfileRecord) {
+	expandedStack := makeProfStack()
+	for i := range p {
+		cf := CallersFrames(p[i].Stack())
+		j := 0
+		for j < len(expandedStack) {
+			f, more := cf.Next()
+			// f.PC is a "call PC"; later consumers expect "return PCs"
+			expandedStack[j] = f.PC + 1
+			j++
+			if !more {
+				break
+			}
+		}
+		k := copy(p[i].Stack0[:], expandedStack[:j])
+		clear(p[i].Stack0[k:])
+	}
 }
 
 // blockProfileInternal returns the number of records n in the profile. If there
@@ -1111,6 +1180,9 @@ func blockProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord
 	return
 }
 
+// copyBlockProfileRecord copies the sample values and call stack from src to dst.
+// The call stack is copied as-is. The caller is responsible for handling inline
+// expansion, needed when the call stack was collected with frame pointer unwinding.
 func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProfileRecord) {
 	dst.Count = src.Count
 	dst.Cycles = src.Cycles
@@ -1123,7 +1195,11 @@ func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProf
 	if asanenabled {
 		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
 	}
-	i := fpunwindExpand(dst.Stack0[:], src.Stack)
+	// We just copy the stack here without inline expansion
+	// (needed if frame pointer unwinding is used)
+	// since this function is called under the profile lock,
+	// and doing something that might allocate can violate lock ordering.
+	i := copy(dst.Stack0[:], src.Stack)
 	clear(dst.Stack0[i:])
 }
 
@@ -1142,10 +1218,15 @@ func pprof_blockProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok
 // Most clients should use the [runtime/pprof] package
 // instead of calling MutexProfile directly.
 func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
-	return mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
+	n, ok = mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
 		copyBlockProfileRecord(&p[0], r)
 		p = p[1:]
 	})
+	if !ok {
+		return
+	}
+	expandFrames(p[:n])
+	return
 }
 
 // mutexProfileInternal returns the number of records n in the profile. If there
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index d3af5bba914ec..4b7a9f63c6500 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -404,6 +404,25 @@ type countProfile interface {
 	Label(i int) *labelMap
 }
 
+// expandInlinedFrames copies the call stack from pcs into dst, expanding any
+// PCs corresponding to inlined calls into the corresponding PCs for the inlined
+// functions. Returns the number of frames copied to dst.
+func expandInlinedFrames(dst, pcs []uintptr) int {
+	cf := runtime.CallersFrames(pcs)
+	var n int
+	for n < len(dst) {
+		f, more := cf.Next()
+		// f.PC is a "call PC", but later consumers will expect
+		// "return PCs"
+		dst[n] = f.PC + 1
+		n++
+		if !more {
+			break
+		}
+	}
+	return n
+}
+
 // printCountCycleProfile outputs block profile records (for block or mutex profiles)
 // as the pprof-proto format output. Translations from cycle count to time duration
 // are done because The proto expects count and time (nanoseconds) instead of count
@@ -426,7 +445,7 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, records []
 		values[1] = int64(float64(r.Cycles) / cpuGHz)
 		// For count profiles, all stack addresses are
 		// return PCs, which is what appendLocsForStack expects.
-		n := pprof_fpunwindExpand(expandedStack[:], r.Stack)
+		n := expandInlinedFrames(expandedStack, r.Stack)
 		locs = b.appendLocsForStack(locs[:0], expandedStack[:n])
 		b.pbSample(values, locs, nil)
 	}
@@ -935,7 +954,7 @@ func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile fu
 	for i := range p {
 		r := &p[i]
 		fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
-		n := pprof_fpunwindExpand(expandedStack, r.Stack)
+		n := expandInlinedFrames(expandedStack, r.Stack)
 		stack := expandedStack[:n]
 		for _, pc := range stack {
 			fmt.Fprintf(w, " %#x", pc)
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 09abbb31ae6cc..bd11914544fbc 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -2578,3 +2578,81 @@ func produceProfileEvents(t *testing.T, depth int) {
 	runtime.GC()
 	goroutineDeep(t, depth-4) // -4 for produceProfileEvents, **, chanrecv1, chanrev, gopark
 }
+
+func TestMutexBlockFullAggregation(t *testing.T) {
+	// This regression test is adapted from
+	// https://github.com/grafana/pyroscope-go/issues/103,
+	// authored by Tolya Korniltsev
+
+	var m sync.Mutex
+
+	prev := runtime.SetMutexProfileFraction(-1)
+	defer runtime.SetMutexProfileFraction(prev)
+
+	const fraction = 1
+	const iters = 100
+	const workers = 2
+
+	runtime.SetMutexProfileFraction(fraction)
+	runtime.SetBlockProfileRate(1)
+	defer runtime.SetBlockProfileRate(0)
+
+	wg := sync.WaitGroup{}
+	wg.Add(workers)
+	for j := 0; j < workers; j++ {
+		go func() {
+			for i := 0; i < iters; i++ {
+				m.Lock()
+				// Wait at least 1 millisecond to pass the
+				// starvation threshold for the mutex
+				time.Sleep(time.Millisecond)
+				m.Unlock()
+			}
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+
+	assertNoDuplicates := func(name string, collect func([]runtime.BlockProfileRecord) (int, bool)) {
+		var p []runtime.BlockProfileRecord
+		n, ok := collect(nil)
+		for {
+			p = make([]runtime.BlockProfileRecord, n+50)
+			n, ok = collect(p)
+			if ok {
+				p = p[:n]
+				break
+			}
+		}
+		seen := make(map[string]struct{})
+		for _, r := range p {
+			cf := runtime.CallersFrames(r.Stack())
+			var stack strings.Builder
+			for {
+				f, more := cf.Next()
+				stack.WriteString(f.Func.Name())
+				if !more {
+					break
+				}
+				stack.WriteByte('\n')
+			}
+			s := stack.String()
+			if !strings.Contains(s, "TestMutexBlockFullAggregation") {
+				continue
+			}
+			if _, ok := seen[s]; ok {
+				t.Errorf("saw duplicate entry in %s profile with stack:\n%s", name, s)
+			}
+			seen[s] = struct{}{}
+		}
+		if len(seen) == 0 {
+			t.Errorf("did not see any samples in %s profile for this test", name)
+		}
+	}
+	t.Run("mutex", func(t *testing.T) {
+		assertNoDuplicates("mutex", runtime.MutexProfile)
+	})
+	t.Run("block", func(t *testing.T) {
+		assertNoDuplicates("block", runtime.BlockProfile)
+	})
+}
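
A minimal consumer-side sketch (not part of the patch): after this change, runtime.BlockProfile and
runtime.MutexProfile still return records whose Stack() holds return PCs with inline frames already
expanded, so callers can keep symbolizing them with runtime.CallersFrames. The program below uses only
existing public runtime APIs; the contention loop, the record-sizing loop, and the printed format are
illustrative choices, not anything defined by this patch.

	package main

	import (
		"fmt"
		"runtime"
		"sync"
		"time"
	)

	func main() {
		// Record every mutex contention event (illustrative; real programs
		// usually sample with a larger fraction).
		prev := runtime.SetMutexProfileFraction(1)
		defer runtime.SetMutexProfileFraction(prev)

		// Generate a little contention so the profile has something in it.
		var mu sync.Mutex
		var wg sync.WaitGroup
		for i := 0; i < 2; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				for j := 0; j < 10; j++ {
					mu.Lock()
					time.Sleep(time.Millisecond)
					mu.Unlock()
				}
			}()
		}
		wg.Wait()

		// Grow the slice until MutexProfile reports ok, as its contract requires.
		var records []runtime.BlockProfileRecord
		n, ok := runtime.MutexProfile(nil)
		for !ok {
			records = make([]runtime.BlockProfileRecord, n+50)
			n, ok = runtime.MutexProfile(records)
		}
		records = records[:n]

		// Each record's PCs are return addresses; CallersFrames maps them
		// (including inlined calls) back to symbolic frames.
		for _, r := range records {
			fmt.Printf("%d contention events, %d cycles of delay:\n", r.Count, r.Cycles)
			frames := runtime.CallersFrames(r.Stack())
			for {
				f, more := frames.Next()
				fmt.Printf("  %s (%s:%d)\n", f.Function, f.File, f.Line)
				if !more {
					break
				}
			}
		}
	}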