Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Go error tracking #1004

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bpf/errors.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef __ERRORS_H_
#define __ERRORS_H_

// Size of the buffer that receives the task command name.
#ifndef TASK_COMM_LEN
#define TASK_COMM_LEN 16
#endif

// Maximum number of error-message bytes captured per event.
#ifndef ERR_MSG_LEN
#define ERR_MSG_LEN 128
#endif

// Maximum number of user-space stack frames captured per event.
#ifndef MAX_STACK_DEPTH
#define MAX_STACK_DEPTH 32
#endif

// One 64-bit slot per captured stack frame.
typedef __u64 stack_trace_t[MAX_STACK_DEPTH];

// Snapshot of an error captured by the error probes.
//
// The layout is shared with user space, so field order and sizes matter.
// All padding is explicit: instances of this struct are built on the BPF
// stack and handed to map helpers, and leaving a compiler-inserted hole
// uninitialised is a classic cause of verifier rejections.
typedef struct error_event {
    __u32 pid;                // process (tgid) that raised the error
    __u32 cpu_id;             // CPU the probe ran on
    char comm[TASK_COMM_LEN]; // command name of the current task
    __s32 ustack_sz;          // return value of bpf_get_stack(): bytes written, or negative on failure
    __u32 _pad;               // explicit padding; keeps ustack 8-byte aligned at the same offset as before
    stack_trace_t ustack;     // captured user-space stack frames
    __u8 err_msg[ERR_MSG_LEN]; // error message bytes copied from the traced process
} error_event;

#endif /* __ERRORS_H_ */
88 changes: 87 additions & 1 deletion bpf/go_nethttp.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "tracing.h"
#include "hpack.h"
#include "ringbuf.h"
#include "errors.h"

typedef struct http_client_data {
u8 method[METHOD_MAX_LEN];
Expand Down Expand Up @@ -57,6 +58,13 @@ struct {
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} ongoing_http_server_requests SEC(".maps");

// Most recent error_event recorded per goroutine.
// An entry is created by uprobe_error, has its message filled in by
// uprobe_errorReturn, and is looked up and then deleted by
// uprobe_ServeHTTPReturns when the HTTP request trace is emitted.
// Being an LRU hash, old entries can be evicted when the map is full.
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, go_addr_key_t); // key: pointer to the request goroutine
__type(value, struct error_event);
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} last_error SEC(".maps");

/* HTTP Server */

// This instrumentation attaches uprobe to the following function:
Expand Down Expand Up @@ -246,6 +254,9 @@ int uprobe_ServeHTTPReturns(struct pt_regs *ctx) {
make_tp_string(tp_buf, &invocation->tp);
bpf_dbg_printk("tp: %s", tp_buf);

struct error_event *error = bpf_map_lookup_elem(&last_error, &g_key);
bpf_map_delete_elem(&last_error, &g_key);

http_request_trace *trace = bpf_ringbuf_reserve(&events, sizeof(http_request_trace), 0);
if (!trace) {
bpf_dbg_printk("can't reserve space in the ringbuffer");
Expand All @@ -256,7 +267,9 @@ int uprobe_ServeHTTPReturns(struct pt_regs *ctx) {
trace->type = EVENT_HTTP_REQUEST;
trace->start_monotime_ns = invocation->start_monotime_ns;
trace->end_monotime_ns = bpf_ktime_get_ns();

if (error) {
trace->error = *error;
}
goroutine_metadata *g_metadata = bpf_map_lookup_elem(&ongoing_goroutines, &g_key);
if (g_metadata) {
trace->go_start_monotime_ns = g_metadata->timestamp;
Expand Down Expand Up @@ -473,6 +486,79 @@ int uprobe_roundTripReturn(struct pt_regs *ctx) {
return 0;
}

// Entry probe for the instrumented error function.
//
// Records pid, CPU, command name and the user-space stack of the current
// goroutine into a fresh error_event and stores it in last_error, keyed by
// the goroutine address. The error message itself is not available here; it
// is filled in later by uprobe_errorReturn.
//
// Always returns 0.
SEC("uprobe/error")
int uprobe_error(struct pt_regs *ctx) {
    bpf_dbg_printk("=== uprobe/proc error === ");

    void *goroutine_addr = GOROUTINE_PTR(ctx);
    go_addr_key_t g_key = {};
    go_addr_key_from_id(&g_key, goroutine_addr);
    bpf_dbg_printk("goroutine_addr %lx", goroutine_addr);

    // Same value as the kernel's BPF_F_USER_STACK flag. It is spelled out
    // under a different name so the local constant cannot shadow or clash
    // with a definition coming from the kernel headers.
    const u64 user_stack_flag = (1ULL << 8);

    // Designated initialiser zeroes every field that is not listed.
    struct error_event event = {
        .pid = bpf_get_current_pid_tgid() >> 32,
        .cpu_id = bpf_get_smp_processor_id(),
    };

    if (bpf_get_current_comm(event.comm, sizeof(event.comm))) {
        event.comm[0] = 0;
    }

    // Read the stack trace. A negative size signals failure and is kept in
    // the event so the consumer can tell that no frames were captured.
    event.ustack_sz = bpf_get_stack(ctx, event.ustack, sizeof(event.ustack), user_stack_flag);
    if (event.ustack_sz < 0) {
        bpf_dbg_printk("can't read user stack: %d", event.ustack_sz);
    }

    // Get the caller of the error function and store it in the first slot of
    // the stack. The slot is only overwritten when the read succeeds, so a
    // failed read no longer replaces a valid frame with 0.
    // NOTE(review): reading the return address from the top of the stack
    // matches x86-64 call semantics; on arm64 the return address is normally
    // held in the link register at function entry - confirm before relying on
    // slot 0 there.
    void *sp_caller = STACK_PTR(ctx);
    u64 caller = 0;
    if (bpf_probe_read(&caller, sizeof(caller), sp_caller) == 0) {
        bpf_dbg_printk("sp_caller %lx caller %lx", sp_caller, caller);
        event.ustack[0] = caller;
    } else {
        bpf_dbg_printk("can't read caller address at %lx", sp_caller);
    }

    // Write event
    if (bpf_map_update_elem(&last_error, &g_key, &event, BPF_ANY)) {
        bpf_dbg_printk("can't update event error map element");
    }
    return 0;
}

// Return probe for the instrumented error function.
//
// Looks up the error_event that uprobe_error stored for the current
// goroutine and copies the error message into it. The message is truncated
// to fit err_msg and is always NUL-terminated.
//
// Always returns 0; does nothing when no pending event exists.
SEC("uprobe/error_return")
int uprobe_errorReturn(struct pt_regs *ctx) {
    bpf_dbg_printk("=== uprobe/proc error return === ");

    void *goroutine_addr = GOROUTINE_PTR(ctx);
    go_addr_key_t g_key = {};
    go_addr_key_from_id(&g_key, goroutine_addr);
    bpf_dbg_printk("goroutine_addr %lx", goroutine_addr);

    error_event *event = bpf_map_lookup_elem(&last_error, &g_key);
    if (event == NULL) {
        bpf_dbg_printk("can't read error event");
        return 0;
    }

    // Read the error message
    // GO_PARAM1(ctx) is the pointer to the error message
    // GO_PARAM2(ctx) is the length of the error message
    void *msg_ptr = GO_PARAM1(ctx);
    u64 len = (u64)GO_PARAM2(ctx);

    // Reserve the last byte for the terminator so that even a message of
    // ERR_MSG_LEN bytes or more ends up as a valid C string.
    const u64 max_len = sizeof(event->err_msg) - 1;
    u64 size = len < max_len ? len : max_len;
    if (bpf_probe_read(event->err_msg, size, msg_ptr)) {
        bpf_dbg_printk("can't read error message");
        // Report an empty message rather than whatever a failed read left behind.
        size = 0;
    }
    event->err_msg[size] = 0;
    bpf_dbg_printk("error msg %llx, %s", msg_ptr, event->err_msg);

    // `event` points directly at the value stored in last_error, so the
    // writes above have already updated the map; no bpf_map_update_elem
    // call is needed.
    return 0;
}

#ifndef NO_HEADER_PROPAGATION
// Context propagation through HTTP headers
SEC("uprobe/header_writeSubset")
Expand Down
45 changes: 23 additions & 22 deletions bpf/headers/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,35 +21,36 @@

#if defined(__TARGET_ARCH_x86)

#define GO_PARAM1(x) ((void*)(x)->ax)
#define GO_PARAM2(x) ((void*)(x)->bx)
#define GO_PARAM3(x) ((void*)(x)->cx)
#define GO_PARAM4(x) ((void*)(x)->di)
#define GO_PARAM5(x) ((void*)(x)->si)
#define GO_PARAM6(x) ((void*)(x)->r8)
#define GO_PARAM7(x) ((void*)(x)->r9)
#define GO_PARAM8(x) ((void*)(x)->r10)
#define GO_PARAM9(x) ((void*)(x)->r11)
#define GO_PARAM1(x) ((void *)(x)->ax)
#define GO_PARAM2(x) ((void *)(x)->bx)
#define GO_PARAM3(x) ((void *)(x)->cx)
#define GO_PARAM4(x) ((void *)(x)->di)
#define GO_PARAM5(x) ((void *)(x)->si)
#define GO_PARAM6(x) ((void *)(x)->r8)
#define GO_PARAM7(x) ((void *)(x)->r9)
#define GO_PARAM8(x) ((void *)(x)->r10)
#define GO_PARAM9(x) ((void *)(x)->r11)

// In x86, current goroutine is pointed by r14, according to
// https://go.googlesource.com/go/+/refs/heads/dev.regabi/src/cmd/compile/internal-abi.md#amd64-architecture
#define GOROUTINE_PTR(x) ((void*)(x)->r14)

#define GOROUTINE_PTR(x) ((void *)(x)->r14)
#define STACK_PTR(x) ((void *)(x)->sp)
#elif defined(__TARGET_ARCH_arm64)

#define GO_PARAM1(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[0])
#define GO_PARAM2(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[1])
#define GO_PARAM3(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[2])
#define GO_PARAM4(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[3])
#define GO_PARAM5(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[4])
#define GO_PARAM6(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[5])
#define GO_PARAM7(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[6])
#define GO_PARAM8(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[7])
#define GO_PARAM9(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[8])
#define GO_PARAM1(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[0])
#define GO_PARAM2(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[1])
#define GO_PARAM3(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[2])
#define GO_PARAM4(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[3])
#define GO_PARAM5(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[4])
#define GO_PARAM6(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[5])
#define GO_PARAM7(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[6])
#define GO_PARAM8(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[7])
#define GO_PARAM9(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[8])

// In arm64, current goroutine is pointed by R28 according to
// https://github.com/golang/go/blob/master/src/cmd/compile/abi-internal.md#arm64-architecture
#define GOROUTINE_PTR(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[28])
#define GOROUTINE_PTR(x) ((void *)((PT_REGS_ARM64 *)(x))->regs[28])
// On arm64 regs[0..30] are the general-purpose registers x0-x30; the stack
// pointer is the separate `sp` member (regs[13] is the 32-bit ARM convention).
// Assumes PT_REGS_ARM64 resolves to struct user_pt_regs - TODO confirm.
#define STACK_PTR(x) ((void *)((PT_REGS_ARM64 *)(x))->sp)

#endif /*defined(__TARGET_ARCH_arm64)*/

Expand All @@ -58,5 +59,5 @@
"%0 = %[max]\n" \
: "+r"(VAR) \
: [max] "i"(UMAX))

#endif /* __UTILS_H__ */
2 changes: 2 additions & 0 deletions bpf/tracer_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "pid_types.h"
#include "utils.h"
#include "errors.h"
#include "http_types.h"

#define PATH_MAX_LEN 100
Expand All @@ -40,6 +41,7 @@ typedef struct http_request_trace_t {
u16 status;
connection_info_t conn __attribute__((aligned(8)));
s64 content_length;
error_event error;
tp_info_t tp;

pid_info pid;
Expand Down
11 changes: 6 additions & 5 deletions pkg/beyla/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,12 @@ var DefaultConfig = Config{
TTL: defaultMetricsTTL,
},
Traces: otel.TracesConfig{
Protocol: otel.ProtocolUnset,
TracesProtocol: otel.ProtocolUnset,
MaxQueueSize: 4096,
MaxExportBatchSize: 4096,
ReportersCacheLen: ReporterLRUSize,
Protocol: otel.ProtocolUnset,
TracesProtocol: otel.ProtocolUnset,
MaxQueueSize: 4096,
MaxExportBatchSize: 4096,
ReportersCacheLen: ReporterLRUSize,
ReportExceptionEvents: false,
Instrumentations: []string{
instrumentations.InstrumentationALL,
},
Expand Down
10 changes: 5 additions & 5 deletions pkg/export/alloy/traces.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ import (
func TracesReceiver(
ctx context.Context,
ctxInfo *global.ContextInfo,
cfg *beyla.TracesReceiverConfig,
cfg *beyla.Config,
userAttribSelection attributes.Selection,
) pipe.FinalProvider[[]request.Span] {
return (&tracesReceiver{ctx: ctx, cfg: cfg, attributes: userAttribSelection, hostID: ctxInfo.HostID}).provideLoop
}

type tracesReceiver struct {
ctx context.Context
cfg *beyla.TracesReceiverConfig
cfg *beyla.Config
attributes attributes.Selection
hostID string
}
Expand All @@ -35,7 +35,7 @@ func (tr *tracesReceiver) spanDiscarded(span *request.Span) bool {
}

func (tr *tracesReceiver) provideLoop() (pipe.FinalFunc[[]request.Span], error) {
if !tr.cfg.Enabled() {
if !tr.cfg.TracesReceiver.Enabled() {
return pipe.IgnoreFinal[[]request.Span](), nil
}
return func(in <-chan []request.Span) {
Expand All @@ -53,8 +53,8 @@ func (tr *tracesReceiver) provideLoop() (pipe.FinalFunc[[]request.Span], error)
}
envResourceAttrs := otel.ResourceAttrsFromEnv(&span.ServiceID)

for _, tc := range tr.cfg.Traces {
traces := otel.GenerateTraces(span, tr.hostID, traceAttrs, envResourceAttrs)
for _, tc := range tr.cfg.TracesReceiver.Traces {
traces := otel.GenerateTraces(tr.cfg.Traces, span, tr.hostID, traceAttrs, envResourceAttrs)
err := tc.ConsumeTraces(tr.ctx, traces)
if err != nil {
slog.Error("error sending trace to consumer", "error", err)
Expand Down
5 changes: 3 additions & 2 deletions pkg/export/alloy/traces_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func TestTracesSkipsInstrumented(t *testing.T) {
func makeTracesTestReceiver() *tracesReceiver {
return &tracesReceiver{
ctx: context.Background(),
cfg: &beyla.TracesReceiverConfig{},
cfg: &beyla.Config{},
attributes: attributes.Selection{},
hostID: "Alloy",
}
Expand All @@ -69,12 +69,13 @@ func generateTracesForSpans(t *testing.T, tr *tracesReceiver, spans []request.Sp
res := []ptrace.Traces{}
traceAttrs, err := otel.GetUserSelectedAttributes(tr.attributes)
assert.NoError(t, err)
cfg := otel.TracesConfig{}
for i := range spans {
span := &spans[i]
if tr.spanDiscarded(span) {
continue
}
res = append(res, otel.GenerateTraces(span, tr.hostID, traceAttrs, []attribute.KeyValue{}))
res = append(res, otel.GenerateTraces(cfg, span, tr.hostID, traceAttrs, []attribute.KeyValue{}))
}

return res
Expand Down
3 changes: 3 additions & 0 deletions pkg/export/debug/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ func textPrinter(input <-chan []request.Span) {
spans[i].ServiceID.SDKLanguage.String(),
traceparent(&spans[i]),
)
if spans[i].ErrorMessage != "" {
fmt.Printf("error_message=%s stacktrace=\n%s\n", spans[i].ErrorMessage, spans[i].ErrorStacktrace)
}
}
}
}
Expand Down
20 changes: 16 additions & 4 deletions pkg/export/otel/traces.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ type TracesConfig struct {
// BackOffMaxElapsedTime is the maximum amount of time (including retries) spent trying to send a request/batch.
BackOffMaxElapsedTime time.Duration `yaml:"backoff_max_elapsed_time" env:"BEYLA_BACKOFF_MAX_ELAPSED_TIME"`

// ReportExceptionEvents enables the reporting of exception events.
ReportExceptionEvents bool `yaml:"report_exception_events" env:"BEYLA_TRACES_REPORT_EXCEPTION_EVENTS"`

ReportersCacheLen int `yaml:"reporters_cache_len" env:"BEYLA_TRACES_REPORT_CACHE_LEN"`

// SDKLogLevel works independently from the global LogLevel because it prints GBs of logs in Debug mode
Expand Down Expand Up @@ -195,7 +198,7 @@ func (tr *tracesOTELReceiver) processSpans(exp exporter.Traces, spans []request.
}

envResourceAttrs := ResourceAttrsFromEnv(&span.ServiceID)
traces := GenerateTracesWithAttributes(span, tr.ctxInfo.HostID, finalAttrs, envResourceAttrs)
traces := GenerateTracesWithAttributes(tr.cfg, span, tr.ctxInfo.HostID, finalAttrs, envResourceAttrs)
err := exp.ConsumeTraces(tr.ctx, traces)
if err != nil {
slog.Error("error sending trace to consumer", "error", err)
Expand Down Expand Up @@ -415,7 +418,7 @@ func traceAppResourceAttrs(hostID string, service *svc.ID) []attribute.KeyValue
return attrs
}

func GenerateTracesWithAttributes(span *request.Span, hostID string, attrs []attribute.KeyValue, envResourceAttrs []attribute.KeyValue) ptrace.Traces {
func GenerateTracesWithAttributes(cfg TracesConfig, span *request.Span, hostID string, attrs []attribute.KeyValue, envResourceAttrs []attribute.KeyValue) ptrace.Traces {
t := span.Timings()
start := spanStartTime(t)
hasSubSpans := t.Start.After(start)
Expand Down Expand Up @@ -457,6 +460,15 @@ func GenerateTracesWithAttributes(span *request.Span, hostID string, attrs []att
m := attrsToMap(attrs)
m.CopyTo(s.Attributes())

// Set error message and stacktrace
if cfg.ReportExceptionEvents && span.ErrorMessage != "" {
e := s.Events().AppendEmpty()
e.SetName(semconv.ExceptionEventName)
e.Attributes().PutStr(string(semconv.ExceptionMessageKey), span.ErrorMessage)
e.Attributes().PutStr(string(semconv.ExceptionTypeKey), "error")
e.Attributes().PutStr(string(semconv.ExceptionStacktraceKey), span.ErrorStacktrace)
}

// Set status code
statusCode := codeToStatusCode(request.SpanStatusCode(span))
s.Status().SetCode(statusCode)
Expand All @@ -465,8 +477,8 @@ func GenerateTracesWithAttributes(span *request.Span, hostID string, attrs []att
}

// GenerateTraces creates a ptrace.Traces from a request.Span
func GenerateTraces(span *request.Span, hostID string, userAttrs map[attr.Name]struct{}, envResourceAttrs []attribute.KeyValue) ptrace.Traces {
return GenerateTracesWithAttributes(span, hostID, traceAttributes(span, userAttrs), envResourceAttrs)
func GenerateTraces(cfg TracesConfig, span *request.Span, hostID string, userAttrs map[attr.Name]struct{}, envResourceAttrs []attribute.KeyValue) ptrace.Traces {
return GenerateTracesWithAttributes(cfg, span, hostID, traceAttributes(span, userAttrs), envResourceAttrs)
}

// createSubSpans creates the internal spans for a request.Span
Expand Down
Loading
Loading