Commit 0fe4b38

sinkap authored and Alexei Starovoitov committed

bpf: Allow bpf_local_storage to be used by sleepable programs
Other maps like hashmaps are already available to sleepable programs. Sleepable BPF programs run under trace RCU. Allow task, sk and inode storage to be used from sleepable programs. This allows sleepable and non-sleepable programs to provide shareable annotations on kernel objects.

Sleepable programs run in trace RCU, whereas non-sleepable programs run in a normal RCU critical section, i.e. __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable} take rcu_read_lock_trace or rcu_read_lock respectively. In order to make the local storage maps accessible to both sleepable and non-sleepable programs, one needs to call both call_rcu_tasks_trace and call_rcu to wait for both trace and classical RCU grace periods to expire before freeing memory.

Paul's work on call_rcu_tasks_trace allows us to have per-CPU queueing for call_rcu_tasks_trace; this behaviour can be enabled by setting the rcupdate.rcu_task_enqueue_lim=<num_cpus> boot parameter. In light of these performance improvements, and to keep the local storage code simple, avoid adding a new flag for sleepable maps / local storage to select the RCU synchronization (trace / classical).

Also, update the dereferencing of the pointers to use rcu_dereference_check (with either the trace or normal RCU locks held) through a common bpf_rcu_lock_held helper.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20211224152916.1550677-2-kpsingh@kernel.org
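
To make the effect concrete, here is a minimal sketch of a sleepable BPF LSM program using task local storage, something the verifier rejected before this change. The map name, hook, and counter logic are illustrative, not taken from this commit:

/* Hypothetical sleepable LSM program: count execs per task in task
 * local storage. "lsm.s/" marks the program as sleepable.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, __u64);
} exec_count SEC(".maps");

SEC("lsm.s/bprm_committed_creds")
int BPF_PROG(count_exec, struct linux_binprm *bprm)
{
        __u64 *cnt;

        cnt = bpf_task_storage_get(&exec_count, bpf_get_current_task_btf(),
                                   0, BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (cnt)
                __sync_fetch_and_add(cnt, 1);
        return 0;
}

char _license[] SEC("license") = "GPL";

Non-sleepable programs (e.g. plain lsm/ or fentry) can attach the same map, which is what makes the annotations shareable across both program kinds.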
1 parent 3ccdcee commit 0fe4b38

File tree

6 files changed (+62, -16 lines)


include/linux/bpf_local_storage.h

Lines changed: 5 additions & 0 deletions

@@ -17,6 +17,9 @@
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE 16
 
+#define bpf_rcu_lock_held() \
+        (rcu_read_lock_held() || rcu_read_lock_trace_held() || \
+         rcu_read_lock_bh_held())
 struct bpf_local_storage_map_bucket {
         struct hlist_head list;
         raw_spinlock_t lock;

@@ -162,4 +165,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
                          void *value, u64 map_flags);
 
+void bpf_local_storage_free_rcu(struct rcu_head *rcu);
+
 #endif /* _BPF_LOCAL_STORAGE_H */
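
bpf_rcu_lock_held() is purely a lockdep expression: a dereference is considered safe if any of the classic, trace, or BH RCU read-side critical sections is held. A simplified sketch of the caller side, assuming the sleepable entry point only needs the trace RCU read lock (the real __bpf_prog_enter_sleepable() also handles migration disabling and stats):

#include <linux/rcupdate_trace.h>

/* Simplified sketch, not kernel code: a sleepable program runs under
 * the trace RCU read lock, so rcu_read_lock_trace_held() is true and
 * rcu_dereference_check(p, bpf_rcu_lock_held()) passes inside helpers.
 */
static void run_sleepable_prog(void (*prog)(void))
{
        rcu_read_lock_trace();
        prog();                 /* may fault / sleep */
        rcu_read_unlock_trace();
}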

kernel/bpf/bpf_inode_storage.c

Lines changed: 5 additions & 1 deletion

@@ -17,6 +17,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
 
 DEFINE_BPF_STORAGE_CACHE(inode_cache);
 

@@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
         if (!bsb)
                 return NULL;
 
-        inode_storage = rcu_dereference(bsb->storage);
+        inode_storage =
+                rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
         if (!inode_storage)
                 return NULL;
 

@@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 {
         struct bpf_local_storage_data *sdata;
 
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
                 return (unsigned long)NULL;
 

@@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
 BPF_CALL_2(bpf_inode_storage_delete,
            struct bpf_map *, map, struct inode *, inode)
 {
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (!inode)
                 return -EINVAL;
 

kernel/bpf/bpf_local_storage.c

Lines changed: 37 additions & 13 deletions

@@ -11,6 +11,9 @@
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
 
 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
 

@@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
         return NULL;
 }
 
+void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+        struct bpf_local_storage *local_storage;
+
+        local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+        kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+        struct bpf_local_storage_elem *selem;
+
+        selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+        kfree_rcu(selem, rcu);
+}
+
 /* local_storage->lock must be held and selem->local_storage == local_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.

@@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
         bool free_local_storage;
         void *owner;
 
-        smap = rcu_dereference(SDATA(selem)->smap);
+        smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
         owner = local_storage->owner;
 
         /* All uncharging on the owner must be done first.

@@ -118,21 +137,20 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
                  *
                  * Although the unlock will be done under
                  * rcu_read_lock(), it is more intuitive to
-                 * read if kfree_rcu(local_storage, rcu) is done
+                 * read if the freeing of the storage is done
                  * after the raw_spin_unlock_bh(&local_storage->lock).
                  *
                  * Hence, a "bool free_local_storage" is returned
-                 * to the caller which then calls the kfree_rcu()
-                 * after unlock.
+                 * to the caller which then frees the storage after
+                 * all the RCU grace periods have expired.
                  */
         }
         hlist_del_init_rcu(&selem->snode);
         if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
             SDATA(selem))
                 RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-        kfree_rcu(selem, rcu);
-
+        call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
         return free_local_storage;
 }
 

@@ -146,15 +164,17 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
                 /* selem has already been unlinked from sk */
                 return;
 
-        local_storage = rcu_dereference(selem->local_storage);
+        local_storage = rcu_dereference_check(selem->local_storage,
+                                              bpf_rcu_lock_held());
         raw_spin_lock_irqsave(&local_storage->lock, flags);
         if (likely(selem_linked_to_storage(selem)))
                 free_local_storage = bpf_selem_unlink_storage_nolock(
                         local_storage, selem, true);
         raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
         if (free_local_storage)
-                kfree_rcu(local_storage, rcu);
+                call_rcu_tasks_trace(&local_storage->rcu,
+                                     bpf_local_storage_free_rcu);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,

@@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
                 /* selem has already been unlinked from smap */
                 return;
 
-        smap = rcu_dereference(SDATA(selem)->smap);
+        smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
         b = select_bucket(smap, selem);
         raw_spin_lock_irqsave(&b->lock, flags);
         if (likely(selem_linked_to_map(selem)))

@@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
         struct bpf_local_storage_elem *selem;
 
         /* Fast path (cache hit) */
-        sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+        sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+                                      bpf_rcu_lock_held());
         if (sdata && rcu_access_pointer(sdata->smap) == smap)
                 return sdata;
 
         /* Slow path (cache miss) */
-        hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+        hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+                                 rcu_read_lock_trace_held())
                 if (rcu_access_pointer(SDATA(selem)->smap) == smap)
                         break;
 

@@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner,
                  * bucket->list, first_selem can be freed immediately
                  * (instead of kfree_rcu) because
                  * bpf_local_storage_map_free() does a
-                 * synchronize_rcu() before walking the bucket->list.
+                 * synchronize_rcu_mult (waiting for both sleepable and
+                 * normal programs) before walking the bucket->list.
                  * Hence, no one is accessing selem from the
                  * bucket->list under rcu_read_lock().
                  */

@@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
                       !map_value_has_spin_lock(&smap->map)))
                 return ERR_PTR(-EINVAL);
 
-        local_storage = rcu_dereference(*owner_storage(smap, owner));
+        local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+                                              bpf_rcu_lock_held());
         if (!local_storage || hlist_empty(&local_storage->list)) {
                 /* Very first elem for the owner */
                 err = check_flags(NULL, map_flags);
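
The free path above chains the two grace periods through a single rcu_head: call_rcu_tasks_trace() invokes its callback once a trace RCU grace period has expired, and the callback's kfree_rcu() reuses the same rcu_head to push the actual kfree() behind a classic RCU grace period. A standalone sketch of the pattern, with a hypothetical object type:

#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

/* Hypothetical object readable under rcu_read_lock() (non-sleepable
 * programs) or rcu_read_lock_trace() (sleepable programs).
 */
struct my_obj {
        long payload;
        struct rcu_head rcu;
};

static void my_obj_free_rcu(struct rcu_head *rcu)
{
        struct my_obj *obj = container_of(rcu, struct my_obj, rcu);

        /* Trace RCU grace period is over; now also wait out a classic
         * RCU grace period by reusing the same rcu_head.
         */
        kfree_rcu(obj, rcu);
}

static void my_obj_release(struct my_obj *obj)
{
        /* Free only after *both* kinds of readers are done. */
        call_rcu_tasks_trace(&obj->rcu, my_obj_free_rcu);
}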

kernel/bpf/bpf_task_storage.c

Lines changed: 5 additions & 1 deletion

@@ -17,6 +17,7 @@
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
 #include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
 
 DEFINE_BPF_STORAGE_CACHE(task_cache);
 

@@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
         struct bpf_local_storage *task_storage;
         struct bpf_local_storage_map *smap;
 
-        task_storage = rcu_dereference(task->bpf_storage);
+        task_storage =
+                rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
         if (!task_storage)
                 return NULL;
 

@@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
 {
         struct bpf_local_storage_data *sdata;
 
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
                 return (unsigned long)NULL;
 

@@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
 {
         int ret;
 
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (!task)
                 return -EINVAL;
 

kernel/bpf/verifier.c

Lines changed: 3 additions & 0 deletions

@@ -11874,6 +11874,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                 }
                 break;
         case BPF_MAP_TYPE_RINGBUF:
+        case BPF_MAP_TYPE_INODE_STORAGE:
+        case BPF_MAP_TYPE_SK_STORAGE:
+        case BPF_MAP_TYPE_TASK_STORAGE:
                 break;
         default:
                 verbose(env,

net/core/bpf_sk_storage.c

Lines changed: 7 additions & 1 deletion

@@ -13,6 +13,7 @@
 #include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 #include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
 
 DEFINE_BPF_STORAGE_CACHE(sk_cache);
 

@@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
         struct bpf_local_storage *sk_storage;
         struct bpf_local_storage_map *smap;
 
-        sk_storage = rcu_dereference(sk->sk_bpf_storage);
+        sk_storage =
+                rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held());
         if (!sk_storage)
                 return NULL;
 

@@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 {
         struct bpf_local_storage_data *sdata;
 
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
                 return (unsigned long)NULL;
 

@@ -288,6 +291,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
 
 BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
 {
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (!sk || !sk_fullsock(sk))
                 return -EINVAL;
 

@@ -416,6 +420,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
            void *, value, u64, flags)
 {
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (in_hardirq() || in_nmi())
                 return (unsigned long)NULL;
 

@@ -425,6 +430,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
 BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
            struct sock *, sk)
 {
+        WARN_ON_ONCE(!bpf_rcu_lock_held());
         if (in_hardirq() || in_nmi())
                 return -EPERM;
 
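
On the BPF program side, sk storage now follows the same pattern from sleepable hooks. A hypothetical sleepable LSM program tagging sockets at connect time (map name, hook, and flag bit are illustrative, not from this commit):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
        __uint(type, BPF_MAP_TYPE_SK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, __u32);
} sk_tags SEC(".maps");

SEC("lsm.s/socket_connect")
int BPF_PROG(tag_connect, struct socket *sock, struct sockaddr *address,
             int addrlen)
{
        __u32 *tag;

        /* Runs under trace RCU; the helper's WARN_ON_ONCE check is
         * satisfied via rcu_read_lock_trace_held().
         */
        tag = bpf_sk_storage_get(&sk_tags, sock->sk, 0,
                                 BPF_SK_STORAGE_GET_F_CREATE);
        if (tag)
                *tag |= 1;
        return 0;
}

char _license[] SEC("license") = "GPL";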
