open-telemetry · christos68k · Jan 24, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
@@ -28,7 +28,6 @@ import (
 	eim "go.opentelemetry.io/ebpf-profiler/processmanager/execinfomanager"
 	"go.opentelemetry.io/ebpf-profiler/reporter"
 	"go.opentelemetry.io/ebpf-profiler/times"
-	"go.opentelemetry.io/ebpf-profiler/tracehandler"
 	"go.opentelemetry.io/ebpf-profiler/traceutil"
 	"go.opentelemetry.io/ebpf-profiler/util"
 )
@@ -348,34 +347,6 @@ func (pm *ProcessManager) MaybeNotifyAPMAgent(
 	return serviceName
 }
 
-func (pm *ProcessManager) SymbolizationComplete(traceCaptureKTime times.KTime) {
-	pm.mu.Lock()
-	defer pm.mu.Unlock()
-
-	nowKTime := times.GetKTime()
-	log.Debugf("SymbolizationComplete captureKT: %v latency: %v ms",
-		traceCaptureKTime, (nowKTime-traceCaptureKTime)/1e6)
-
-	for pid, pidExitKTime := range pm.exitEvents {
-		if pidExitKTime > traceCaptureKTime {
-			continue
-		}
-		for _, instance := range pm.interpreters[pid] {
-			if err := instance.Detach(pm.ebpf, pid); err != nil {
-				log.Errorf("Failed to handle interpreted process exit for PID %d: %v",
-					pid, err)
-			}
-		}
-		delete(pm.interpreters, pid)
-		delete(pm.exitEvents, pid)
-
-		log.Debugf("PID %v exit latency %v ms", pid, (nowKTime-pidExitKTime)/1e6)
-	}
-}
-
-// Compile time check to make sure we satisfy the interface.
-var _ tracehandler.TraceProcessor = (*ProcessManager)(nil)
-
 // AddSynthIntervalData adds synthetic stack deltas to the manager. This is useful for cases where
 // populating the information via the stack delta provider isn't viable, for example because the
 // `.eh_frame` section for a binary is broken. If `AddSynthIntervalData` was called for a given

@@ -602,7 +602,7 @@ func TestProcExit(t *testing.T) {
 
 			populateManager(t, manager)
 
-			_ = manager.ProcessPIDExit(testcase.pid)
+			manager.ProcessPIDExit(testcase.pid)
 			assert.Equal(t, testcase.deletePidPageMappingCount,
 				ebpfMockup.deletePidPageMappingCount)
 			assert.Equal(t, testcase.deleteStackDeltaRangesCount,

@@ -32,6 +32,7 @@ import (
 	"go.opentelemetry.io/ebpf-profiler/reporter"
 	"go.opentelemetry.io/ebpf-profiler/times"
 	"go.opentelemetry.io/ebpf-profiler/tpbase"
+	"go.opentelemetry.io/ebpf-profiler/tracehandler"
 	"go.opentelemetry.io/ebpf-profiler/util"
 )
 
@@ -506,32 +507,36 @@ func (pm *ProcessManager) synchronizeMappings(pr process.Process,
 	return newProcess
 }
 
-// ProcessPIDExit informs the ProcessManager that a process exited and no longer will be scheduled
-// for processing. It also schedules immediate symbolization if the exited PID needs it. exitKTime
-// is stored for later processing in SymbolizationComplete when all traces have been collected.
-// There can be a race condition if we can not clean up the references for this process
+// ProcessPIDExit informs the ProcessManager that a process exited and will no longer be scheduled.
+// exitKTime is stored for later processing in ProcessedUntil, once traces up to this time have been
+// processed. There can be a race condition if we cannot clean up the references for this process
 // fast enough and this particular pid is reused again by the system.
 // NOTE: Exported only for tracer.
-func (pm *ProcessManager) ProcessPIDExit(pid libpf.PID) bool {
+func (pm *ProcessManager) ProcessPIDExit(pid libpf.PID) {
+	exitKTime := times.GetKTime()
 	log.Debugf("- PID: %v", pid)
 	defer pm.ebpf.RemoveReportedPID(pid)
 
 	pm.mu.Lock()
 	defer pm.mu.Unlock()
 
-	symbolize := false
-	exitKTime := times.GetKTime()
-	if pm.interpreterTracerEnabled {
-		if len(pm.interpreters[pid]) > 0 {
+	pidExited := false
+	info, pidExists := pm.pidToProcessInfo[pid]
+	if pidExists || (pm.interpreterTracerEnabled &&
+		len(pm.interpreters[pid]) > 0) {
+		// ProcessPIDExit may be called multiple times in short succession
+		// for the same PID; don't update exitKTime if we've previously recorded it.
+		if _, pidExited = pm.exitEvents[pid]; !pidExited {
 			pm.exitEvents[pid] = exitKTime
-			symbolize = true
 		}
 	}
-
-	info, ok := pm.pidToProcessInfo[pid]
-	if !ok {
+	if !pidExists {
 		log.Debugf("Skip process exit handling for unknown PID %d", pid)
-		return symbolize
+		return
+	}
+	if pidExited {
+		log.Debugf("Skip duplicate process exit handling for PID %d", pid)
+		return
 	}
 
 	// Delete all entries we have for this particular PID from pid_page_to_mapping_info.
@@ -548,9 +553,6 @@ func (pm *ProcessManager) ProcessPIDExit(pid libpf.PID) bool {
 				address, pid, err)
 		}
 	}
-	delete(pm.pidToProcessInfo, pid)
-
-	return symbolize
 }
 
 func (pm *ProcessManager) SynchronizeProcess(pr process.Process) {
@@ -670,3 +672,34 @@ func (pm *ProcessManager) ExePathForPID(pid libpf.PID) string {
 	}
 	return executable
 }
+
+func (pm *ProcessManager) ProcessedUntil(traceCaptureKTime times.KTime) {
+	pm.mu.Lock()
+	defer pm.mu.Unlock()
+
+	nowKTime := times.GetKTime()
+	log.Debugf("ProcessedUntil captureKT: %v latency: %v ms",
+		traceCaptureKTime, (nowKTime-traceCaptureKTime)/1e6)
+
-	pm.mu.Lock()
-	defer pm.mu.Unlock()
-
-	nowKTime := times.GetKTime()
-	log.Debugf("ProcessedUntil captureKT: %v latency: %v ms",
-		traceCaptureKTime, (nowKTime-traceCaptureKTime)/1e6)
+	nowKTime := times.GetKTime()
+	log.Debugf("ProcessedUntil captureKT: %v latency: %v ms",
+		traceCaptureKTime, (nowKTime-traceCaptureKTime)/1e6)
+
+	pm.mu.Lock()
+	defer pm.mu.Unlock()
-	pm.mu.Lock()
-	defer pm.mu.Unlock()
-
-	nowKTime := times.GetKTime()
-	log.Debugf("ProcessedUntil captureKT: %v latency: %v ms",
-		traceCaptureKTime, (nowKTime-traceCaptureKTime)/1e6)
+	nowKTime := times.GetKTime()
+	log.Debugf("ProcessedUntil captureKT: %v latency: %v ms",
+		traceCaptureKTime, (nowKTime-traceCaptureKTime)/1e6)
+
+	pm.mu.Lock()
+	defer pm.mu.Unlock()
+	for pid, pidExitKTime := range pm.exitEvents {
+		if pidExitKTime > traceCaptureKTime {
+			continue
+		}
+
+		delete(pm.pidToProcessInfo, pid)
+
+		for _, instance := range pm.interpreters[pid] {
+			if err := instance.Detach(pm.ebpf, pid); err != nil {
+				log.Errorf("Failed to handle interpreted process exit for PID %d: %v",
+					pid, err)
+			}
+		}
+		delete(pm.interpreters, pid)
+		delete(pm.exitEvents, pid)
+
+		log.Debugf("PID %v exit latency %v ms", pid, (nowKTime-pidExitKTime)/1e6)
+	}
+}
+
+// Compile-time check that ProcessManager satisfies tracehandler.TraceProcessor.
+var _ tracehandler.TraceProcessor = (*ProcessManager)(nil)
@@ -47,11 +47,11 @@ type TraceProcessor interface {
 	// the frame and send the associated metadata to the collection agent.
 	ConvertTrace(trace *host.Trace) *libpf.Trace
 
-	// SymbolizationComplete is called after a group of Trace has been symbolized.
+	// ProcessedUntil is called periodically after Traces are processed/symbolized.
 	// It gets the timestamp of when the Traces (if any) were captured. The timestamp
 	// is in essence an indicator that all Traces until that time have been now processed,
-	// and any events up to this time can be processed.
-	SymbolizationComplete(traceCaptureKTime times.KTime)
+	// and any events and cleanup actions up to this time can be processed.
+	ProcessedUntil(traceCaptureKTime times.KTime)
 }
 
 // traceHandler provides functions for handling new traces and trace count updates

@@ -40,7 +40,7 @@ func (f *fakeTraceProcessor) ConvertTrace(trace *host.Trace) *libpf.Trace {
 	return &newTrace
 }
 
-func (f *fakeTraceProcessor) SymbolizationComplete(times.KTime) {}
+func (f *fakeTraceProcessor) ProcessedUntil(times.KTime) {}
 
 func (f *fakeTraceProcessor) MaybeNotifyAPMAgent(*host.Trace, libpf.TraceHash, uint16) string {
 	return ""

@@ -190,9 +190,9 @@ func (t *Tracer) startTraceEventMonitor(ctx context.Context,
 				traceOutChan <- trace
 			}
 			// After we've received and processed all trace events, call
-			// SymbolizationComplete if there is a pending oldKTime that we
+			// ProcessedUntil if there is a pending oldKTime that we
 			// haven't yet propagated to the rest of the agent.
-			// This introduces both an upper bound to SymbolizationComplete
+			// This introduces an upper bound to ProcessedUntil
 			// call frequency (dictated by pollTicker) but also skips calls
 			// when none are needed (e.g. no trace events have been read).
 			//
@@ -206,18 +206,18 @@ func (t *Tracer) startTraceEventMonitor(ctx context.Context,
 			// timestamps t0 < t1 < t2 < t3, this poll loop reads [t3 t1 t2]
 			// in a first iteration and [t0] in a second iteration. If we use
 			// the current iteration minKTime we'll call
-			// SymbolizationComplete(t1) first and t0 next, with t0 < t1.
+			// ProcessedUntil(t1) first and ProcessedUntil(t0) next, with t0 < t1.
 			if oldKTime > 0 {
 				// Ensure that all previously sent trace events have been processed
 				traceOutChan <- nil
 
 				if minKTime > 0 && minKTime <= oldKTime {
 					// If minKTime is smaller than oldKTime, use it and reset it
 					// to avoid a repeat during next iteration.
-					t.TraceProcessor().SymbolizationComplete(minKTime)
+					t.TraceProcessor().ProcessedUntil(minKTime)
 					minKTime = 0
 				} else {
-					t.TraceProcessor().SymbolizationComplete(oldKTime)
+					t.TraceProcessor().ProcessedUntil(oldKTime)
 				}
 			}
 			oldKTime = minKTime