
Commit f202e7f

Delyan Kratunov authored and Kernel Patches Daemon committed
bpf: implement sleepable uprobes by chaining tasks_trace and normal rcu
uprobes work by raising a trap, setting a task flag from within the interrupt handler, and processing the actual work for the uprobe on the way back to userspace. As a result, uprobe handlers already execute in a user context. The primary obstacle to sleepable bpf uprobe programs is therefore on the bpf side.

Namely, the bpf_prog_array attached to the uprobe is protected by normal rcu and runs with preemption disabled. In order for uprobe bpf programs to become actually sleepable, we need it to be protected by the tasks_trace rcu flavor instead (and kfree() called after a corresponding grace period).

Based on Alexei's proposal, we change the free path for bpf_prog_array to chain tasks_trace and normal grace periods one after the other. Users who iterate under a tasks_trace read section are safe, as are users who iterate under normal read sections (from non-sleepable locations). The downside is that we take the tasks_trace latency for all perf_event-attached bpf programs (and not just uprobe ones), but this is deemed safe given the possible attach rates for kprobe/uprobe/tp programs.

Separately, non-sleepable programs need access to dynamically sized rcu-protected maps, so we conditionally disable preemption and take an rcu read section around them, in addition to the overarching tasks_trace section.

Signed-off-by: Delyan Kratunov <delyank@fb.com>
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
1 parent b04e19b commit f202e7f
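For context, this is the kind of handler the change enables. Below is a minimal sketch of a sleepable uprobe BPF program; it is not part of this commit and assumes libbpf's "uprobe.s" section convention for sleepable uprobe programs, the bpf_copy_from_user() helper, and a CO-RE build (vmlinux.h, -D__TARGET_ARCH_x86). The traced symbol and program name are hypothetical.

/* sleepable_uprobe.bpf.c - illustrative sketch, not part of this commit. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>    /* PT_REGS_PARM1(); needs -D__TARGET_ARCH_x86 */

char line[64];

/* The ".s" suffix marks the program as sleepable; it may call helpers that
 * fault, such as bpf_copy_from_user(), because the prog_array it runs from
 * is now protected by tasks_trace rcu rather than preempt-off rcu. */
SEC("uprobe.s")
int handle_readline(struct pt_regs *ctx)
{
        const char *user_str = (const char *)PT_REGS_PARM1(ctx);

        /* May fault and sleep: copies directly from user memory. */
        bpf_copy_from_user(line, sizeof(line), user_str);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";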

File tree

5 files changed: +99 −5 lines

include/linux/bpf.h

Lines changed: 57 additions & 0 deletions

@@ -26,6 +26,7 @@
 #include <linux/stddef.h>
 #include <linux/bpfptr.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>

 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1343,6 +1344,8 @@ extern struct bpf_empty_prog_array bpf_empty_prog_array;

 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
+/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
 bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
@@ -1428,6 +1431,60 @@ bpf_prog_run_array(const struct bpf_prog_array *array,
         return ret;
 }

+/**
+ * Notes on RCU design for bpf_prog_arrays containing sleepable programs:
+ *
+ * We use the tasks_trace rcu flavor read section to protect the bpf_prog_array
+ * overall. As a result, we must use the bpf_prog_array_free_sleepable
+ * in order to use the tasks_trace rcu grace period.
+ *
+ * When a non-sleepable program is inside the array, we take the rcu read
+ * section and disable preemption for that program alone, so it can access
+ * rcu-protected dynamically sized maps.
+ */
+static __always_inline u32
+bpf_prog_run_array_sleepable(const struct bpf_prog_array __rcu *array_rcu,
+                             const void *ctx, bpf_prog_run_fn run_prog)
+{
+        const struct bpf_prog_array_item *item;
+        const struct bpf_prog *prog;
+        const struct bpf_prog_array *array;
+        struct bpf_run_ctx *old_run_ctx;
+        struct bpf_trace_run_ctx run_ctx;
+        u32 ret = 1;
+
+        might_fault();
+
+        migrate_disable();
+        rcu_read_lock_trace();
+
+        array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
+        if (unlikely(!array))
+                goto out;
+        old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+        item = &array->items[0];
+        while ((prog = READ_ONCE(item->prog))) {
+                if (!prog->aux->sleepable) {
+                        preempt_disable();
+                        rcu_read_lock();
+                }
+
+                run_ctx.bpf_cookie = item->bpf_cookie;
+                ret &= run_prog(prog, ctx);
+                item++;
+
+                if (!prog->aux->sleepable) {
+                        rcu_read_unlock();
+                        preempt_enable();
+                }
+        }
+        bpf_reset_run_ctx(old_run_ctx);
+out:
+        rcu_read_unlock_trace();
+        migrate_enable();
+        return ret;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;

include/linux/trace_events.h

Lines changed: 1 addition & 0 deletions

@@ -736,6 +736,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)

 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);

kernel/bpf/core.c

Lines changed: 15 additions & 0 deletions

@@ -2268,6 +2268,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
         kfree_rcu(progs, rcu);
 }

+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+        struct bpf_prog_array *progs;
+
+        progs = container_of(rcu, struct bpf_prog_array, rcu);
+        kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+        if (!progs || progs == &bpf_empty_prog_array.hdr)
+                return;
+        call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
 int bpf_prog_array_length(struct bpf_prog_array *array)
 {
         struct bpf_prog_array_item *item;
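The new free path chains the two grace periods: call_rcu_tasks_trace() waits out tasks_trace readers, and its callback hands the same rcu_head to kfree_rcu(), so the memory is only released after a normal RCU grace period has also elapsed. Below is a sketch of the same pattern applied to a hypothetical structure; struct my_obj and my_obj_free() are illustrative names, not part of the patch.

#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

struct my_obj {
        struct rcu_head rcu;
        /* payload ... */
};

/* Runs once all rcu_read_lock_trace() readers are done; reusing the same
 * rcu_head for kfree_rcu() then also waits for rcu_read_lock() readers
 * before the actual kfree(). */
static void my_obj_free_cb(struct rcu_head *rcu)
{
        struct my_obj *obj = container_of(rcu, struct my_obj, rcu);

        kfree_rcu(obj, rcu);
}

static void my_obj_free(struct my_obj *obj)
{
        /* Readers may be under either tasks_trace or normal rcu. */
        call_rcu_tasks_trace(&obj->rcu, my_obj_free_cb);
}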

kernel/trace/bpf_trace.c

Lines changed: 25 additions & 2 deletions

@@ -140,6 +140,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
         return ret;
 }

+unsigned int uprobe_call_bpf(struct trace_event_call *call, void *ctx)
+{
+        unsigned int ret;
+
+        /*
+         * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+         * to all call sites, we did a bpf_prog_array_valid() there to check
+         * whether call->prog_array is empty or not, which is
+         * a heuristic to speed up execution.
+         *
+         * If the bpf_prog_array_valid() fetched prog_array was
+         * non-NULL, we go into uprobe_call_bpf() and do the actual
+         * proper rcu_dereference() under the RCU trace lock.
+         * If it turns out that prog_array is NULL, then we bail out.
+         * For the opposite, if the bpf_prog_array_valid() fetched pointer
+         * was NULL, you'll skip the prog_array with the risk of missing
+         * out on events when it was updated in between this and the
+         * rcu_dereference(), which is an accepted risk.
+         */
+        ret = bpf_prog_run_array_sleepable(call->prog_array, ctx, bpf_prog_run);
+        return ret;
+}
+
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
@@ -1915,7 +1938,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
         event->prog = prog;
         event->bpf_cookie = bpf_cookie;
         rcu_assign_pointer(event->tp_event->prog_array, new_array);
-        bpf_prog_array_free(old_array);
+        bpf_prog_array_free_sleepable(old_array);

 unlock:
         mutex_unlock(&bpf_event_mutex);
@@ -1941,7 +1964,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
                 bpf_prog_array_delete_safe(old_array, event->prog);
         } else {
                 rcu_assign_pointer(event->tp_event->prog_array, new_array);
-                bpf_prog_array_free(old_array);
+                bpf_prog_array_free_sleepable(old_array);
         }

         bpf_prog_put(event->prog);

kernel/trace/trace_uprobe.c

Lines changed: 1 addition & 3 deletions

@@ -1346,9 +1346,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
         if (bpf_prog_array_valid(call)) {
                 u32 ret;

-                preempt_disable();
-                ret = trace_call_bpf(call, regs);
-                preempt_enable();
+                ret = uprobe_call_bpf(call, regs);
                 if (!ret)
                         return;
         }
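On the userspace side, such a sleepable program is attached like any other uprobe program. A rough libbpf loader sketch follows, assuming a skeleton generated from the BPF program shown earlier; the skeleton name, traced binary path, and function offset are placeholders, not part of this commit.

/* loader.c - illustrative sketch; attach target and skeleton are placeholders. */
#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "sleepable_uprobe.skel.h"

int main(void)
{
        struct sleepable_uprobe *skel;
        struct bpf_link *link;

        skel = sleepable_uprobe__open_and_load();
        if (!skel)
                return 1;

        /* pid -1: trace all processes; the last argument is the symbol's
         * file offset in the target binary (placeholder value here). */
        link = bpf_program__attach_uprobe(skel->progs.handle_readline,
                                          false /* retprobe */, -1,
                                          "/usr/bin/bash", 0x41b0);
        if (!link) {
                sleepable_uprobe__destroy(skel);
                return 1;
        }

        printf("attached, press Ctrl-C to exit\n");
        pause();

        bpf_link__destroy(link);
        sleepable_uprobe__destroy(skel);
        return 0;
}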
