-
Notifications
You must be signed in to change notification settings - Fork 2
/
handler.c
293 lines (252 loc) · 9.01 KB
/
handler.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#include "vmlinux.h"
#include "bpf_core_read.h"
#include "bpf_helpers.h"
#include "vmlinux_core.h"
// This license needs to be GPL-compatible because the kernel won't let
// programs with a non-GPL-compatible license use many BPF helpers (including
// `bpf_probe_read_*`).
u8 __license[] SEC("license") = "Dual MIT/GPL"; // NOLINT
// Adds some extra log entries that are usually spam when deployed in the real
// world.
//#define DEBUG
// These constants must be kept in sync with Go.
#define ARGLEN 32 // maximum number of args in argv we'll copy
// ARGSIZE is also used as the buffer size for `filename` and `comm` in
// struct event_t.
#define ARGSIZE 1024 // maximum byte length of each arg in argv we'll copy
#define LOGFMTSIZE 1024 // maximum length of log fmt str sent back to userspace
#define LOGARGLEN 3 // maximum number of fmt arguments to a log entry
// Maximum levels of PID namespace nesting. PID namespaces have a hierarchy
// limit of 32 since kernel 3.7.
#define MAX_PIDNS_HIERARCHY 32
// Context passed to the sys_enter_execve tracepoint program below.
// This struct is defined according to
// /sys/kernel/debug/tracing/events/syscalls/sys_enter_execve/format
// The field layout must match that format exactly; the offset/size notes on
// each field are taken from it.
struct exec_info {
u16 common_type; // offset=0, size=2
u8 common_flags; // offset=2, size=1
u8 common_preempt_count; // offset=3, size=1
s32 common_pid; // offset=4, size=4
s32 syscall_nr; // offset=8, size=4
u32 pad; // offset=12, size=4 (pad)
// The pointers below are read with the bpf_probe_read_user* helpers in
// enter_execve() and are never dereferenced directly.
const u8 *filename; // offset=16, size=8 (ptr)
const u8 *const *argv; // offset=24, size=8 (ptr)
const u8 *const *envp; // offset=32, size=8 (ptr)
};
// The event struct. One of these is written to the `events` ring buffer for
// each traced execve() call. This struct must be kept in sync with the Golang
// counterpart.
struct event_t {
// Details about the process being launched.
// filename and each argv entry are copied from userspace with
// bpf_probe_read_user_str() into ARGSIZE-byte buffers.
u8 filename[ARGSIZE];
u8 argv[ARGLEN][ARGSIZE];
u32 argc; // set to ARGLEN + 1 if there were more than ARGLEN arguments
// uid and gid are the lower and upper 32 bits of bpf_get_current_uid_gid().
u32 uid;
u32 gid;
// pid is the lower 32 bits of bpf_get_current_pid_tgid().
u32 pid;
// Name of the calling process.
u8 comm[ARGSIZE];
};
// All-zero event_t used as the copy source when clearing a freshly reserved
// event, so that no stale ring buffer memory is sent to userspace.
static struct event_t zero_event SEC(".rodata") = {0};
// Log entry from eBPF to userspace, written to the `logs` ring buffer by
// log(). This struct must be kept in sync with the Golang counterpart.
struct log_entry_t {
// uid and gid are the lower and upper 32 bits of bpf_get_current_uid_gid();
// pid is the lower 32 bits of bpf_get_current_pid_tgid().
u32 uid;
u32 gid;
u32 pid;
// fmt contains a format string that only contains "%d" and "%u" directives.
// In userspace we will replace these with the arguments in `args`.
u8 fmt[LOGFMTSIZE];
// These are communicated back to userspace as unsigned 32-bit integers, but
// depending on the format string, they could be treated as signed or
// unsigned.
u32 args[LOGARGLEN];
};
// All-zero log_entry_t used as the copy source when clearing a freshly
// reserved log entry before it is filled in.
static struct log_entry_t zero_log SEC(".rodata") = {0};
// This is the ring buffer we'll output events data to. The Go program reads
// from this ring buffer and reads the data into a Go struct for easy usage.
// Each record is a struct event_t reserved and submitted by enter_execve().
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 24);
} events SEC(".maps");
// The ring buffer we will output log entries to. Each record is a
// struct log_entry_t reserved and submitted by log().
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 24);
} logs SEC(".maps");
// The map we'll use to retrieve the configuration about the given filters.
// It is a single-element array of u32 values. The element at
// filter_pidns_idx is the inode number of the PID namespace to filter on; a
// value of 0 means no PID namespace filtering is applied.
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
__uint(max_entries, 1);
} filters SEC(".maps");
// Indexes in the `filters` map for each configuration option.
// filter_pidns_idx is the key under which the target PID namespace is stored.
static u32 filter_pidns_idx SEC(".rodata") = 0;
// LOG[N] calls log() with the unused parameters zeroed out. `N` is the number
// of fmt args you want to use.
//
// `fmt` must be a string literal (or a char array): its size is taken with
// sizeof(fmt), which would only give the pointer size for a `const char *`.
#define LOG0(fmt) LOG3(fmt, 0, 0, 0)
#define LOG1(fmt, arg0) LOG3(fmt, arg0, 0, 0)
#define LOG2(fmt, arg0, arg1) LOG3(fmt, arg0, arg1, 0)
#define LOG3(fmt, arg0, arg1, arg2) log(fmt, sizeof(fmt), arg0, arg1, arg2)
// log emits a log line twice: once through bpf_trace_printk() and once as a
// struct log_entry_t on the `logs` ring buffer, where userspace substitutes
// the arguments into the format string. Call LOG[N]() instead of calling this
// directly.
//
// fmt is the format string and fmt_size its size in bytes (LOG3 passes
// sizeof(fmt)). arg0..arg2 are the format arguments; unused ones are passed
// as 0. If the ring buffer entry cannot be reserved or filled in, it is
// dropped and only the trace output remains.
static void log(const char *fmt, u32 fmt_size, u32 arg0, u32 arg1, u32 arg2) {
    bpf_trace_printk(fmt, fmt_size, arg0, arg1, arg2);

    struct log_entry_t *rec = bpf_ringbuf_reserve(&logs, sizeof(*rec), 0);
    if (!rec) {
        bpf_printk("could not reserve logs ringbuf memory");
        return;
    }

    // Clear the reserved record first so that no stale kernel memory can
    // reach userspace through fields we do not overwrite.
    s32 err = bpf_probe_read_kernel(rec, sizeof(*rec), &zero_log);
    if (err < 0) {
        bpf_printk("zero out log: %d", err);
        goto discard;
    }

    // The raw format string is shipped as-is and formatted in userspace.
    // NOTE: bpf_snprintf is not supported in some of the lower kernel versions
    // we claim to support, so we have to do it this way.
    err = bpf_probe_read_kernel_str(&rec->fmt, sizeof(rec->fmt), fmt);
    if (err < 0) {
        bpf_printk("could not read fmt into log struct: %d", err);
        goto discard;
    }

    rec->uid = bpf_get_current_uid_gid();
    rec->gid = bpf_get_current_uid_gid() >> 32; // NOLINT(readability-magic-numbers)
    rec->pid = bpf_get_current_pid_tgid();
    rec->args[0] = arg0;
    rec->args[1] = arg1;
    rec->args[2] = arg2;

    bpf_ringbuf_submit(rec, 0);
    return;

discard:
    // A reserved record must be released even when it is not submitted.
    bpf_ringbuf_discard(rec, 0);
}
// filter_pidns checks whether the current task is in the PID namespace
// identified by target_pidns, or in one nested underneath it. It starts at
// the task's nsproxy->pid_ns_for_children and follows `parent` links for at
// most MAX_PIDNS_HIERARCHY levels, comparing each namespace's ns.inum with
// target_pidns.
//
// Returns 0 on a match. Otherwise returns a non-zero value: the error from a
// failed read, or -1 when the chain ends (or the level limit is reached)
// without a match.
s32 filter_pidns(u32 target_pidns) {
    struct task_struct___exectrace *cur_task = (void *)bpf_get_current_task(); // NOLINT(performance-no-int-to-ptr)
    struct pid_namespace___exectrace *cur_ns;

    s32 err = BPF_CORE_READ_INTO(&cur_ns, cur_task, nsproxy, pid_ns_for_children);
    if (err != 0) {
        LOG1("could not read current task pidns: %d", err);
        return err;
    }

    // Walk up the PID namespace tree until we either find the PID namespace
    // we're filtering for, or run out of parent namespaces.
    u32 ns_inum;
    u32 depth;
    for (depth = 0; depth < MAX_PIDNS_HIERARCHY; depth++) {
        // The first level is the namespace read above; every later level
        // steps to the parent of the previous one.
        if (depth > 0) {
            err = BPF_CORE_READ_INTO(&cur_ns, cur_ns, parent);
            if (err != 0) {
                LOG2("could not read parent pidns on iteration %u: %d", depth, err);
                return err;
            }
        }

        // A NULL namespace means we walked past the root without a match.
        if (!cur_ns) {
#ifdef DEBUG
            LOG1("no more pidns after %u iterations", depth);
#endif
            return -1;
        }

        err = BPF_CORE_READ_INTO(&ns_inum, cur_ns, ns.inum);
        if (err != 0) {
            LOG2("could not read pidns common on iteration %u: %d", depth, err);
            return err;
        }

#ifdef DEBUG
        LOG3("got pidns on iteration %u: %u (target=%u)", depth, ns_inum, target_pidns);
#endif
        if (ns_inum == target_pidns) {
            // This namespace (the task's own or one of its ancestors) is the
            // target PID namespace.
            return 0;
        }
    }

    // Checked every allowed level of the hierarchy without finding the
    // target.
#ifdef DEBUG
    LOG1("does not match pidns filter after %u iterations", depth);
#endif
    return -1;
}
// Tracepoint at the top of execve() syscall.
//
// enter_execve builds a struct event_t describing the exec (filename, up to
// ARGLEN arguments, uid/gid/pid and comm of the calling task) and submits it
// to the `events` ring buffer. If a non-zero PID namespace filter is
// configured in `filters`, tasks for which filter_pidns() returns non-zero
// are skipped.
//
// Returns 0 once an event has been submitted, or 1 if the call was filtered
// out or the event could not be built (nothing is submitted in that case).
SEC("tracepoint/syscalls/sys_enter_execve")
s32 enter_execve(struct exec_info *ctx) {
    u32 *target_pidns = bpf_map_lookup_elem(&filters, &filter_pidns_idx);
    if (target_pidns && *target_pidns && filter_pidns(*target_pidns)) {
        return 1;
    }

    // Reserve memory for our event on the `events` ring buffer defined above.
    struct event_t *event;
    event = bpf_ringbuf_reserve(&events, sizeof(struct event_t), 0);
    if (!event) {
        LOG0("could not reserve events ringbuf memory");
        return 1;
    }

    // Zero out the event for safety. If we don't do this, we risk sending
    // random kernel memory back to userspace.
    //
    // The size must be that of the whole struct: sizeof(event) is only the
    // size of the pointer, which would leave nearly all of the record
    // (including argc, which is incremented below) uninitialized.
    s32 ret = bpf_probe_read_kernel(event, sizeof(*event), &zero_event);
    if (ret) {
        LOG1("zero out event: %d", ret);
        bpf_ringbuf_discard(event, 0);
        return 1;
    }

    // Store process/calling process details.
    event->uid = bpf_get_current_uid_gid();
    event->gid = bpf_get_current_uid_gid() >> 32; // NOLINT(readability-magic-numbers)
    event->pid = bpf_get_current_pid_tgid();
    ret = bpf_get_current_comm(&event->comm, sizeof(event->comm));
    if (ret) {
        LOG1("could not get current comm: %d", ret);
        bpf_ringbuf_discard(event, 0);
        return 1;
    }

    // Write the filename in addition to argv[0] because the filename contains
    // the full path to the file which could be more useful in some situations.
    ret = bpf_probe_read_user_str(&event->filename, sizeof(event->filename), ctx->filename);
    if (ret < 0) {
        LOG1("could not read filename into event struct: %d", ret);
        bpf_ringbuf_discard(event, 0);
        return 1;
    }

    // Copy everything from ctx->argv to event->argv, incrementing event->argc
    // as we go. Any failure here still submits the event with the arguments
    // copied so far.
    for (u32 i = 0; i < ARGLEN; i++) {
        if (!(&ctx->argv[i])) {
            goto out;
        }

        // Copying the arg into its own variable before copying it into
        // event->argv[i] prevents memory corruption.
        const u8 *argp = NULL;
        ret = bpf_probe_read_user(&argp, sizeof(argp), &ctx->argv[i]);
        if (ret || !argp) {
            // Read failure or the NULL terminator of argv: stop copying.
            goto out;
        }

        // Copy argp to event->argv[i].
        ret = bpf_probe_read_user_str(event->argv[i], sizeof(event->argv[i]), argp);
        if (ret < 0) {
            LOG2("read argv %u: %d", i, ret);
            goto out;
        }

        event->argc++;
    }

    // This won't get hit if we `goto out` in the loop above. This is to signify
    // to userspace that we couldn't copy all of the arguments because it
    // exceeded ARGLEN.
    // NOTE(review): this is also reached when argv holds exactly ARGLEN
    // entries, because the terminator is never inspected in that case.
    event->argc++;

out:
    // Write the event to the ring buffer and notify userspace. This will cause
    // the `Read()` call in userspace to return if it was blocked.
    bpf_ringbuf_submit(event, 0);

    return 0;
}