diff --git a/ebpf/bpf/profile.bpf.c b/ebpf/bpf/profile.bpf.c index 748abd4ad1..a9b0437a71 100644 --- a/ebpf/bpf/profile.bpf.c +++ b/ebpf/bpf/profile.bpf.c @@ -5,6 +5,9 @@ #include "bpf_tracing.h" #include "profile.bpf.h" #include "pid.h" +#include "ume.h" + +#define PF_KTHREAD 0x00200000 SEC("perf_event") int do_perf_event(struct bpf_perf_event_data *ctx) { @@ -14,9 +17,20 @@ int do_perf_event(struct bpf_perf_event_data *ctx) { struct sample_key key = {}; u32 *val, one = 1; - if (tgid == 0) { + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + if (tgid == 0 || task == 0) { + return 0; + } + int flags = 0; + if (pyro_bpf_core_read(&flags, sizeof(flags), &task->flags)) { + bpf_dbg_printk("failed to read task->flags\n"); return 0; } + if (flags & PF_KTHREAD) { + bpf_dbg_printk("skipping kthread %d\n", tgid); + return 0; + } + struct pid_config *config = bpf_map_lookup_elem(&pids, &tgid); if (config == NULL) { struct pid_config unknown = { @@ -25,7 +39,10 @@ int do_perf_event(struct bpf_perf_event_data *ctx) { .collect_user = 0, .padding_ = 0 }; - bpf_map_update_elem(&pids, &tgid, &unknown, BPF_NOEXIST); + if (bpf_map_update_elem(&pids, &tgid, &unknown, BPF_NOEXIST)) { + bpf_dbg_printk("failed to update pids map. 
probably concurrent update\n"); + return 0; + } struct pid_event event = { .op = OP_REQUEST_UNKNOWN_PROCESS_INFO, .pid = tgid diff --git a/ebpf/pyrobpf/profile_bpfel_arm64.o b/ebpf/pyrobpf/profile_bpfel_arm64.o index ed6322c75c..fa5f868173 100644 Binary files a/ebpf/pyrobpf/profile_bpfel_arm64.o and b/ebpf/pyrobpf/profile_bpfel_arm64.o differ diff --git a/ebpf/pyrobpf/profile_bpfel_x86.o b/ebpf/pyrobpf/profile_bpfel_x86.o index 4d805b2890..84c77a44dd 100644 Binary files a/ebpf/pyrobpf/profile_bpfel_x86.o and b/ebpf/pyrobpf/profile_bpfel_x86.o differ diff --git a/ebpf/session.go b/ebpf/session.go index 3e622890c9..a55ca7c7e1 100644 --- a/ebpf/session.go +++ b/ebpf/session.go @@ -745,6 +745,44 @@ func (s *session) cleanup() { } } } + + if s.roundNumber%10 == 0 { + s.checkStalePids() + } +} + +// iterate over all pids and check if they are alive +// it is only needed in case disassociate_ctty hook somehow misses a process death +func (s *session) checkStalePids() { + var ( + m = s.bpf.Pids + mapSize = m.MaxEntries() + nextKey = uint32(0) + ) + keys := make([]uint32, mapSize) + values := make([]pyrobpf.ProfilePidConfig, mapSize) + n, err := m.BatchLookup(nil, &nextKey, keys, values, new(ebpf.BatchOptions)) + _ = level.Debug(s.logger).Log("msg", "check stale pids", "count", n) + for i := 0; i < n; i++ { + _, err := os.Stat(fmt.Sprintf("/proc/%d/status", keys[i])) + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + _ = level.Error(s.logger).Log("msg", "check stale pids", "err", err) + } + if err := m.Delete(keys[i]); err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) { + _ = level.Error(s.logger).Log("msg", "delete stale pid", "pid", keys[i], "err", err) + } + _ = level.Debug(s.logger).Log("msg", "stale pid deleted", "pid", keys[i]) + continue + } else { + _ = level.Debug(s.logger).Log("msg", "stale pid check : alive", "pid", keys[i], "config", fmt.Sprintf("%+v", values[i])) + } + } + if err != nil { + if !errors.Is(err, ebpf.ErrKeyNotExist) { + _ = 
level.Error(s.logger).Log("msg", "check stale pids", "err", err) + } + } } type stackBuilder struct { diff --git a/ebpf/symtab/elf.go b/ebpf/symtab/elf.go index 2208a937bf..c82fae43f9 100644 --- a/ebpf/symtab/elf.go +++ b/ebpf/symtab/elf.go @@ -92,8 +92,7 @@ func (et *ElfTable) load() { me, err := elf2.NewMMapedElfFile(fsElfFilePath) if err != nil { - et.err = err - et.onLoadError() + et.onLoadError(err) return } defer me.Close() // todo do not close if it is the selected elf @@ -104,8 +103,7 @@ func (et *ElfTable) load() { } buildID, err := me.BuildID() if err != nil && !errors.Is(err, elf2.ErrNoBuildIDSection) { - et.err = err - et.onLoadError() + et.onLoadError(err) return } @@ -117,8 +115,7 @@ func (et *ElfTable) load() { } fileInfo, err := os.Stat(fsElfFilePath) if err != nil { - et.err = err - et.onLoadError() + et.onLoadError(err) return } symbols = et.options.ElfCache.GetSymbolsByStat(statFromFileInfo(fileInfo)) @@ -132,16 +129,14 @@ func (et *ElfTable) load() { if debugFilePath != "" { debugMe, err := elf2.NewMMapedElfFile(path.Join(et.fs, debugFilePath)) if err != nil { - et.err = err - et.onLoadError() + et.onLoadError(err) return } defer debugMe.Close() // todo do not close if it is the selected elf symbols, err = et.createSymbolTable(debugMe) if err != nil { - et.err = err - et.onLoadError() + et.onLoadError(err) return } et.table = symbols @@ -152,8 +147,7 @@ func (et *ElfTable) load() { symbols, err = et.createSymbolTable(me) level.Debug(et.logger).Log("msg", "create symbol table", "f", me.FilePath()) if err != nil { - et.err = err - et.onLoadError() + et.onLoadError(err) return } @@ -326,8 +320,17 @@ func (et *ElfTable) DebugInfo() elf2.SymTabDebugInfo { return et.table.DebugInfo() } -func (et *ElfTable) onLoadError() { - level.Error(et.logger).Log("msg", "failed to load elf table", "err", et.err, +func (et *ElfTable) onLoadError(err error) { + et.err = err + var l log.Logger + if errors.Is(err, os.ErrNotExist) { + l = level.Debug(et.logger) + } 
else { + l = level.Error(et.logger) + } + l.Log( + "msg", "failed to load elf table", + "err", et.err, "f", et.elfFilePath, "fs", et.fs) if et.options.Metrics != nil {