
Commit 1e6c62a

Alexei Starovoitov authored and borkmann committed
bpf: Introduce sleepable BPF programs
Introduce sleepable BPF programs that can request this property for themselves via the BPF_F_SLEEPABLE flag at program load time. In that case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable, and only when they are attached to kernel functions that are known to allow sleeping.

Non-sleepable programs rely on implicit rcu_read_lock() and migrate_disable() to protect the lifetime of programs, the maps they use, and the per-cpu kernel structures used to pass info between bpf programs and the kernel. Sleepable programs cannot be enclosed in rcu_read_lock(). migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs should not be enclosed in migrate_disable() either. Therefore rcu_read_lock_trace is used to protect the lifetime of sleepable progs.

There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure, and the kernel code uses rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. All these cases are left untouched. Instead, sleepable bpf programs are allowed with the bpf trampoline only. The program pointers are hard-coded into the generated assembly of the bpf trampoline, and synchronize_rcu_tasks_trace() is used to protect the lifetime of the program. The same trampoline can hold both sleepable and non-sleepable progs.

When rcu_read_lock_trace is held it means that some sleepable bpf program is running from the bpf trampoline. Such programs can use bpf arrays and preallocated hash/lru maps. These map types wait for programs to complete via synchronize_rcu_tasks_trace().

Updates to the trampoline now have to do both synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for the trampoline assembly to finish.

This is the first step toward introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: KP Singh <kpsingh@google.com>
Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com
1 parent 76cd617 commit 1e6c62a
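For orientation before the file-by-file diff, here is a minimal sketch of a sleepable program from the BPF side, loosely modeled on the selftests that accompany this series. Treat the names as illustrative: the "lsm.s/" section prefix is the libbpf convention for sleepable programs added alongside this kernel change, and bpf_lsm_file_mprotect is one of the two hooks on the sleepable allowlist introduced below.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("lsm.s/file_mprotect")
int BPF_PROG(mprotect_audit, struct vm_area_struct *vma,
	     unsigned long reqprot, unsigned long prot, int ret)
{
	char args[64];

	/* bpf_copy_from_user() may fault and sleep; that is only legal
	 * because this program was loaded with BPF_F_SLEEPABLE and runs
	 * from the trampoline under rcu_read_lock_trace().
	 */
	bpf_copy_from_user(args, sizeof(args),
			   (void *)vma->vm_mm->arg_start);
	return ret;
}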

File tree: 10 files changed (+162 lines, -25 lines)

arch/x86/net/bpf_jit_comp.c

Lines changed: 21 additions & 11 deletions
@@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	u8 *prog = *pprog;
 	int cnt = 0;

-	if (emit_call(&prog, __bpf_prog_enter, prog))
-		return -EINVAL;
-	/* remember prog start time returned by __bpf_prog_enter */
-	emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+	if (p->aux->sleepable) {
+		if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
+			return -EINVAL;
+	} else {
+		if (emit_call(&prog, __bpf_prog_enter, prog))
+			return -EINVAL;
+		/* remember prog start time returned by __bpf_prog_enter */
+		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+	}

 	/* arg1: lea rdi, [rbp - stack_size] */
 	EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (mod_ret)
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);

-	/* arg1: mov rdi, progs[i] */
-	emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
-		       (u32) (long) p);
-	/* arg2: mov rsi, rbx <- start time in nsec */
-	emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
-	if (emit_call(&prog, __bpf_prog_exit, prog))
-		return -EINVAL;
+	if (p->aux->sleepable) {
+		if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
+			return -EINVAL;
+	} else {
+		/* arg1: mov rdi, progs[i] */
+		emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
+			       (u32) (long) p);
+		/* arg2: mov rsi, rbx <- start time in nsec */
+		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+		if (emit_call(&prog, __bpf_prog_exit, prog))
+			return -EINVAL;
+	}

 	*pprog = prog;
 	return 0;
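Net effect on the generated trampoline, as a sketch inferred from the hunks above rather than literal JIT output (on x86-64, BPF_REG_6 lives in rbx and the first two argument registers are rdi/rsi):

	/* non-sleepable prog */
	call __bpf_prog_enter            /* rcu_read_lock() etc., returns start time */
	mov  rbx, rax                    /* remember start time in BPF_REG_6 */
	lea  rdi, [rbp - stack_size]     /* arg1: ctx */
	call <prog>
	mov  rdi, <prog pointer>         /* arg1: struct bpf_prog * */
	mov  rsi, rbx                    /* arg2: start time in nsec */
	call __bpf_prog_exit

	/* sleepable prog */
	call __bpf_prog_enter_sleepable  /* rcu_read_lock_trace(), no timestamp */
	lea  rdi, [rbp - stack_size]     /* arg1: ctx */
	call <prog>
	call __bpf_prog_exit_sleepable   /* rcu_read_unlock_trace() */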

include/linux/bpf.h

Lines changed: 3 additions & 0 deletions
@@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
 /* these two functions are called from generated trampoline */
 u64 notrace __bpf_prog_enter(void);
 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
+void notrace __bpf_prog_enter_sleepable(void);
+void notrace __bpf_prog_exit_sleepable(void);

 struct bpf_ksym {
 	unsigned long start;
@@ -734,6 +736,7 @@ struct bpf_prog_aux {
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
 	bool func_proto_unreliable;
+	bool sleepable;
 	enum bpf_tramp_prog_type trampoline_prog_type;
 	struct bpf_trampoline *trampoline;
 	struct hlist_node tramp_hlist;

include/uapi/linux/bpf.h

Lines changed: 8 additions & 0 deletions
@@ -346,6 +346,14 @@ enum bpf_link_type {
 /* The verifier internal test flag. Behavior is undefined */
 #define BPF_F_TEST_STATE_FREQ	(1U << 3)

+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE		(1U << 4)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
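How the flag reaches the kernel, as a hedged user-space sketch using the raw bpf(2) syscall; the instruction buffer and attach_btf_id are placeholders that a real loader such as libbpf fills in from the compiled object and kernel BTF:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_sleepable_prog(const struct bpf_insn *insns, __u32 insn_cnt,
			       __u32 attach_btf_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_TRACING;
	attr.expected_attach_type = BPF_TRACE_FENTRY;
	attr.attach_btf_id = attach_btf_id; /* BTF id of a function that may sleep */
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	attr.prog_flags = BPF_F_SLEEPABLE;  /* request sleepable semantics */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}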

init/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -1691,6 +1691,7 @@ config BPF_SYSCALL
 	bool "Enable bpf() system call"
 	select BPF
 	select IRQ_WORK
+	select TASKS_TRACE_RCU
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate eBPF

kernel/bpf/arraymap.c

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>

 #include "map_in_map.h"

kernel/bpf/hashtab.c

Lines changed: 6 additions & 6 deletions
@@ -9,6 +9,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
@@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 	struct htab_elem *l;
 	u32 hash, key_size;

-	/* Must be called with rcu_read_lock. */
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

 	key_size = map->key_size;
@@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* unknown flags */
 		return -EINVAL;

-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

 	key_size = map->key_size;
@@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* unknown flags */
 		return -EINVAL;

-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

 	key_size = map->key_size;
@@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	u32 hash, key_size;
 	int ret = -ENOENT;

-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

 	key_size = map->key_size;
@@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	u32 hash, key_size;
 	int ret = -ENOENT;

-	WARN_ON_ONCE(!rcu_read_lock_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held());

 	key_size = map->key_size;
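These relaxed assertions are what permit map access from sleepable programs, which reach these paths under rcu_read_lock_trace() rather than rcu_read_lock(). A BPF-side sketch (same headers and license as the earlier LSM sketch; map layout and hook are illustrative, and the map must stay preallocated, i.e. no BPF_F_NO_PREALLOC, to satisfy the verifier check shown later):

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 128);
	__type(key, __u32);
	__type(value, __u64);
} exec_counts SEC(".maps");

SEC("lsm.s/bprm_committed_creds")
int BPF_PROG(count_execs, struct linux_binprm *bprm)
{
	__u32 key = 0;
	__u64 *val;

	/* Lookup runs under rcu_read_lock_trace(), held by the trampoline,
	 * which the relaxed WARN_ON_ONCE() checks above now accept.
	 */
	val = bpf_map_lookup_elem(&exec_counts, &key);
	if (val)
		__sync_fetch_and_add(val, 1);
	return 0;
}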

kernel/bpf/syscall.c

Lines changed: 10 additions & 3 deletions
@@ -29,6 +29,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/poll.h>
 #include <linux/bpf-netns.h>
+#include <linux/rcupdate_trace.h>

 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 	btf_put(prog->aux->btf);
 	bpf_prog_free_linfo(prog);

-	if (deferred)
-		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
-	else
+	if (deferred) {
+		if (prog->aux->sleepable)
+			call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
+		else
+			call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+	} else {
 		__bpf_prog_put_rcu(&prog->aux->rcu);
+	}
 }

 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
@@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
 				 BPF_F_ANY_ALIGNMENT |
 				 BPF_F_TEST_STATE_FREQ |
+				 BPF_F_SLEEPABLE |
 				 BPF_F_TEST_RND_HI32))
 		return -EINVAL;

@@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	}

 	prog->aux->offload_requested = !!attr->prog_ifindex;
+	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;

 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)

kernel/bpf/trampoline.c

Lines changed: 25 additions & 3 deletions
@@ -7,6 +7,8 @@
 #include <linux/rbtree_latch.h>
 #include <linux/perf_event.h>
 #include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>

 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	 * updates to trampoline would change the code from underneath the
 	 * preempted task. Hence wait for tasks to voluntarily schedule or go
 	 * to userspace.
+	 * The same trampoline can hold both sleepable and non-sleepable progs.
+	 * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
+	 * programs finish executing.
+	 * Wait for these two grace periods together.
 	 */
-
-	synchronize_rcu_tasks();
+	synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);

 	err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
 					  &tr->func.model, flags, tprogs,
@@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
 		goto out;
 	bpf_image_ksym_del(&tr->ksym);
-	/* wait for tasks to get out of trampoline before freeing it */
+	/* This code will be executed when all bpf progs (both sleepable and
+	 * non-sleepable) went through
+	 * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
+	 * Hence no need for another synchronize_rcu_tasks_trace() here,
+	 * but synchronize_rcu_tasks() is still needed, since trampoline
+	 * may not have had any sleepable programs and we need to wait
+	 * for tasks to get out of trampoline code before freeing it.
+	 */
 	synchronize_rcu_tasks();
 	bpf_jit_free_exec(tr->image);
 	hlist_del(&tr->hlist);
@@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
 	rcu_read_unlock();
 }

+void notrace __bpf_prog_enter_sleepable(void)
+{
+	rcu_read_lock_trace();
+}
+
+void notrace __bpf_prog_exit_sleepable(void)
+{
+	rcu_read_unlock_trace();
+}
+
 int __weak
 arch_prepare_bpf_trampoline(void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
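Taken together, the two helpers above mean each sleepable program invocation is conceptually bracketed as follows (a sketch of the semantics, not literal kernel code; the calls are emitted into the trampoline assembly). This bracketing is what makes synchronize_rcu_tasks_trace() a correct wait-for-all-sleepable-progs barrier:

	rcu_read_lock_trace();           /* __bpf_prog_enter_sleepable() */
	ret = BPF_PROG_RUN(prog, ctx);   /* may sleep, e.g. in bpf_copy_from_user() */
	rcu_read_unlock_trace();         /* __bpf_prog_exit_sleepable() */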

kernel/bpf/verifier.c

Lines changed: 79 additions & 2 deletions
@@ -21,6 +21,7 @@
 #include <linux/ctype.h>
 #include <linux/error-injection.h>
 #include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>

 #include "disasm.h"

@@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}

+	if (prog->aux->sleepable)
+		switch (map->map_type) {
+		case BPF_MAP_TYPE_HASH:
+		case BPF_MAP_TYPE_LRU_HASH:
+		case BPF_MAP_TYPE_ARRAY:
+			if (!is_preallocated_map(map)) {
+				verbose(env,
+					"Sleepable programs can only use preallocated hash maps\n");
+				return -EINVAL;
+			}
+			break;
+		default:
+			verbose(env,
+				"Sleepable programs can only use array and hash maps\n");
+			return -EINVAL;
+		}
+
 	return 0;
 }

@@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr)
 	return -EINVAL;
 }

+/* non exhaustive list of sleepable bpf_lsm_*() functions */
+BTF_SET_START(btf_sleepable_lsm_hooks)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_lsm_file_mprotect)
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
+#endif
+BTF_SET_END(btf_sleepable_lsm_hooks)
+
+static int check_sleepable_lsm_hook(u32 btf_id)
+{
+	return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id);
+}
+
+/* list of non-sleepable functions that are otherwise on
+ * ALLOW_ERROR_INJECTION list
+ */
+BTF_SET_START(btf_non_sleepable_error_inject)
+/* Three functions below can be called from sleepable and non-sleepable context.
+ * Assume non-sleepable from bpf safety point of view.
+ */
+BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, should_fail_alloc_page)
+BTF_ID(func, should_failslab)
+BTF_SET_END(btf_non_sleepable_error_inject)
+
+static int check_non_sleepable_error_inject(u32 btf_id)
+{
+	return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
+}
+
 static int check_attach_btf_id(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
@@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	long addr;
 	u64 key;

+	if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
+	    prog->type != BPF_PROG_TYPE_LSM) {
+		verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+		return -EINVAL;
+	}
+
 	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
 		return check_struct_ops_btf_id(env);

@@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 			}
 		}

-		if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+		if (prog->aux->sleepable) {
+			ret = -EINVAL;
+			switch (prog->type) {
+			case BPF_PROG_TYPE_TRACING:
+				/* fentry/fexit/fmod_ret progs can be sleepable only if they are
+				 * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+				 */
+				if (!check_non_sleepable_error_inject(btf_id) &&
+				    within_error_injection_list(addr))
+					ret = 0;
+				break;
+			case BPF_PROG_TYPE_LSM:
+				/* LSM progs check that they are attached to bpf_lsm_*() funcs.
+				 * Only some of them are sleepable.
+				 */
+				if (check_sleepable_lsm_hook(btf_id))
+					ret = 0;
+				break;
+			default:
+				break;
+			}
+			if (ret)
+				verbose(env, "%s is not sleepable\n",
+					prog->aux->attach_func_name);
+		} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
 			ret = check_attach_modify_return(prog, addr);
 			if (ret)
 				verbose(env, "%s() is not modifiable\n",
 					prog->aux->attach_func_name);
 		}
-
 		if (ret)
 			goto out;
 		tr->func.addr = (void *)addr;
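The map restriction in check_map_prog_compatibility() seen from user space, as a hedged sketch using the classic bpf_create_map() wrapper from libbpf: a sleepable program may reference the first map below, while referencing the second fails verification with "Sleepable programs can only use preallocated hash maps".

#include <linux/bpf.h>
#include <bpf/bpf.h>

static void demo_maps(void)
{
	/* Preallocated (the default): acceptable in sleepable progs. */
	int ok_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(__u32),
				   sizeof(__u64), 128, 0);

	/* Dynamically allocated: rejected when used by a sleepable prog. */
	int bad_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(__u32),
				    sizeof(__u64), 128, BPF_F_NO_PREALLOC);

	(void)ok_fd;
	(void)bad_fd;
}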

tools/include/uapi/linux/bpf.h

Lines changed: 8 additions & 0 deletions
@@ -346,6 +346,14 @@ enum bpf_link_type {
 /* The verifier internal test flag. Behavior is undefined */
 #define BPF_F_TEST_STATE_FREQ	(1U << 3)

+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE		(1U << 4)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
