Merge tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Pull bpf fixes from Daniel Borkmann:

 - Fix BPF verifier to force a checkpoint when the program's jump
   history becomes too long (Eduard Zingerman)

 - Add several fixes to the BPF bits iterator addressing issues like
   memory leaks and overflow problems (Hou Tao)

 - Fix an out-of-bounds write in trie_get_next_key (Byeonguk Jeong)

 - Fix BPF test infra's LIVE_FRAME frame update after a page has been
   recycled (Toke Høiland-Jørgensen)

 - Fix BPF verifier and undo the 40 bytes of extra stack space for
   bpf_fastcall patterns due to various bugs (Eduard Zingerman)

 - Fix a BPF sockmap race condition which could trigger a NULL pointer
   dereference in sock_map_link_update_prog (Cong Wang)

 - Fix tcp_bpf_recvmsg_parser to retrieve copied_seq from tcp_sk under
   the socket lock (Jiayuan Chen)

* tag 'bpf-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  bpf, test_run: Fix LIVE_FRAME frame update after a page has been recycled
  selftests/bpf: Add three test cases for bits_iter
  bpf: Use __u64 to save the bits in bits iterator
  bpf: Check the validity of nr_words in bpf_iter_bits_new()
  bpf: Add bpf_mem_alloc_check_size() helper
  bpf: Free dynamically allocated bits in bpf_iter_bits_destroy()
  bpf: disallow 40-bytes extra stack for bpf_fastcall patterns
  selftests/bpf: Add test for trie_get_next_key()
  bpf: Fix out-of-bounds write in trie_get_next_key()
  selftests/bpf: Test with a very short loop
  bpf: Force checkpoint when jmp history is too long
  bpf: fix filed access without lock
  sock_map: fix a NULL pointer dereference in sock_map_link_update_prog()
torvalds committed Nov 1, 2024
2 parents 90602c2 + c40dd8c commit 5635f18
Showing 13 changed files with 269 additions and 88 deletions.
3 changes: 3 additions & 0 deletions include/linux/bpf_mem_alloc.h
@@ -33,6 +33,9 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);

/* Check the allocation size for kmalloc equivalent allocator */
int bpf_mem_alloc_check_size(bool percpu, size_t size);

/* kmalloc/kfree equivalent: */
void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
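
A minimal sketch of the intended call pattern for the new helper (the wrapper name alloc_checked() is hypothetical; the in-tree caller is bpf_iter_bits_new() in kernel/bpf/helpers.c further down):

/* Hypothetical wrapper: validate the size before allocating from the global
 * bpf_mem_alloc pool; bpf_mem_alloc_check_size() returns -E2BIG for sizes
 * the allocator's buckets cannot satisfy.
 */
static void *alloc_checked(size_t nr_bytes)
{
	if (bpf_mem_alloc_check_size(false /* !percpu */, nr_bytes))
		return NULL;
	return bpf_mem_alloc(&bpf_global_ma, nr_bytes);
}
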
54 changes: 44 additions & 10 deletions kernel/bpf/helpers.c
@@ -2851,21 +2851,47 @@ struct bpf_iter_bits {
__u64 __opaque[2];
} __aligned(8);

#define BITS_ITER_NR_WORDS_MAX 511

struct bpf_iter_bits_kern {
union {
- unsigned long *bits;
- unsigned long bits_copy;
+ __u64 *bits;
+ __u64 bits_copy;
};
- u32 nr_bits;
+ int nr_bits;
int bit;
} __aligned(8);

/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
* a u64 pointer and an unsigned long pointer to find_next_bit() will
* return the same result, as both point to the same 8-byte area.
*
* For 32-bit little-endian hosts, using a u64 pointer or unsigned long
* pointer also makes no difference. This is because the first iterated
* unsigned long is composed of bits 0-31 of the u64 and the second unsigned
* long is composed of bits 32-63 of the u64.
*
* However, for 32-bit big-endian hosts, this is not the case. The first
* iterated unsigned long will be bits 32-63 of the u64, so swap these two
* ulong values within the u64.
*/
static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
{
#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
unsigned int i;

for (i = 0; i < nr; i++)
bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
#endif
}
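
/* Worked example for the comment above (illustrative, not part of the
 * change): take the u64 value 0x1, i.e. only bit 0 set.  On a 32-bit
 * big-endian host its two in-memory longs are 0x00000000 followed by
 * 0x00000001, and find_next_bit() treats the first long as bits 0-31,
 * so the set bit would be reported as bit 32.  After swap_ulong_in_u64()
 * the longs read 0x00000001, 0x00000000 and the bit is reported as 0.
 */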

/**
* bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
* @it: The new bpf_iter_bits to be created
* @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
* @nr_words: The size of the specified memory area, measured in 8-byte units.
- Due to the limitation of memalloc, it can't be greater than 512.
+ The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ further reduced by the BPF memory allocator implementation.
*
* This function initializes a new bpf_iter_bits structure for iterating over
* a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
@@ -2892,17 +2918,24 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w

if (!unsafe_ptr__ign || !nr_words)
return -EINVAL;
if (nr_words > BITS_ITER_NR_WORDS_MAX)
return -E2BIG;

/* Optimization for u64 mask */
if (nr_bits == 64) {
err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
if (err)
return -EFAULT;

swap_ulong_in_u64(&kit->bits_copy, nr_words);

kit->nr_bits = nr_bits;
return 0;
}

if (bpf_mem_alloc_check_size(false, nr_bytes))
return -E2BIG;

/* Fallback to memalloc */
kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
if (!kit->bits)
@@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
return err;
}

swap_ulong_in_u64(kit->bits, nr_words);

kit->nr_bits = nr_bits;
return 0;
}
@@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w
__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
{
struct bpf_iter_bits_kern *kit = (void *)it;
- u32 nr_bits = kit->nr_bits;
- const unsigned long *bits;
- int bit;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;

- if (nr_bits == 0)
+ if (!nr_bits || bit >= nr_bits)
return NULL;

bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
- bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+ bit = find_next_bit(bits, nr_bits, bit + 1);
if (bit >= nr_bits) {
- kit->nr_bits = 0;
+ kit->bit = bit;
return NULL;
}

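
For orientation, a hedged BPF-side sketch of how a program drives this iterator (the SEC() attach point and the extern __ksym declarations are illustrative assumptions; the selftests pull equivalents in from their shared headers):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

extern int bpf_iter_bits_new(struct bpf_iter_bits *it,
			     const u64 *unsafe_ptr__ign, u32 nr_words) __ksym;
extern int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym;
extern void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym;

char _license[] SEC("license") = "GPL";

SEC("syscall")
int count_bits(void *ctx)
{
	u64 mask[2] = { 0x5, 0x8000000000000000ULL };	/* bits 0, 2 and 127 */
	struct bpf_iter_bits it;
	int *bit, nr = 0;

	/* nr_words == 2 means 128 bits, which takes the bpf_mem_alloc path */
	if (!bpf_iter_bits_new(&it, mask, 2)) {
		while ((bit = bpf_iter_bits_next(&it)))
			nr++;
	}
	/* the verifier requires a destroy on every path once _new has run */
	bpf_iter_bits_destroy(&it);
	return nr;	/* 3 on success */
}
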
2 changes: 1 addition & 1 deletion kernel/bpf/lpm_trie.c
@@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
if (!key || key->prefixlen > trie->max_prefixlen)
goto find_leftmost;

- node_stack = kmalloc_array(trie->max_prefixlen,
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
sizeof(struct lpm_trie_node *),
GFP_ATOMIC | __GFP_NOWARN);
if (!node_stack)
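
The off-by-one, spelled out with a worked count that matches the selftest added at the bottom of this commit:

/* With the selftest's 32-bit data keys, max_prefixlen is 32 and a single
 * descent can push nodes with prefix lengths 0, 1, ..., 32 onto
 * node_stack -- max_prefixlen + 1 = 33 entries -- while the old
 * allocation only reserved room for 32 pointers.
 */
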
14 changes: 13 additions & 1 deletion kernel/bpf/memalloc.c
@@ -35,6 +35,8 @@
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)

#define BPF_MEM_ALLOC_SIZE_MAX 4096

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
@@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = {

static int bpf_mem_cache_idx(size_t size)
{
- if (!size || size > 4096)
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
return -1;

if (size <= 192)
@@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)

return !ret ? NULL : ret + LLIST_NODE_SZ;
}

int bpf_mem_alloc_check_size(bool percpu, size_t size)
{
/* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
(!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
return -E2BIG;

return 0;
}
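
A quick consistency check against the iterator limit above (assuming a 64-bit build, where LLIST_NODE_SZ == sizeof(struct llist_node) == 8):

/*   BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ = 4096 - 8 = 4088 bytes
 *   4088 / sizeof(__u64)                   = 511 words
 * which matches BITS_ITER_NR_WORDS_MAX in kernel/bpf/helpers.c.
 */
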
23 changes: 8 additions & 15 deletions kernel/bpf/verifier.c
@@ -6804,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
struct bpf_func_state *state,
enum bpf_access_type t)
{
- struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx];
- int min_valid_off, max_bpf_stack;
-
- /* If accessing instruction is a spill/fill from bpf_fastcall pattern,
- * add room for all caller saved registers below MAX_BPF_STACK.
- * In case if bpf_fastcall rewrite won't happen maximal stack depth
- * would be checked by check_max_stack_depth_subprog().
- */
- max_bpf_stack = MAX_BPF_STACK;
- if (aux->fastcall_pattern)
- max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE;
+ int min_valid_off;

if (t == BPF_WRITE || env->allow_uninit_stack)
- min_valid_off = -max_bpf_stack;
+ min_valid_off = -MAX_BPF_STACK;
else
min_valid_off = -state->allocated_stack;

@@ -17886,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
- bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
- bool add_new_state = force_new_state;
- bool force_exact;
+ bool force_new_state, add_new_state, force_exact;

+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;

/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -17898,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* In tests that amounts to up to 50% reduction into total verifier
* memory consumption and 20% verifier time speedup.
*/
add_new_state = force_new_state;
if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
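
A hedged C sketch of the program shape the new jmp_history_cnt cap targets (the actual selftest added by "selftests/bpf: Test with a very short loop" lives in the bpf selftests; this is only an illustration):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("socket")
int tight_loop(void *ctx)
{
	int i, acc = 0;

	/* Only a handful of instructions per iteration, so the existing
	 * "at least 8 insns since the last checkpoint" heuristic rarely
	 * fires; before this fix the jump history attached to the state
	 * being explored could keep growing for the whole verification run.
	 */
	for (i = 0; i < 10000; i++)
		acc += i & 1;
	return acc & 1;
}
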
1 change: 1 addition & 0 deletions net/bpf/test_run.c
@@ -246,6 +246,7 @@ static void reset_ctx(struct xdp_page_head *head)
head->ctx.data_meta = head->orig_ctx.data_meta;
head->ctx.data_end = head->orig_ctx.data_end;
xdp_update_frame_from_buff(&head->ctx, head->frame);
head->frame->mem = head->orig_ctx.rxq->mem;
}

static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
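
For context, reset_ctx() is exercised via BPF_PROG_TEST_RUN in live-frames mode; a hedged userspace sketch of driving that path (run_live() and its parameters are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf.h>

static int run_live(int xdp_prog_fd, void *pkt, __u32 pkt_len)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = pkt,
		.data_size_in = pkt_len,
		.flags = BPF_F_TEST_XDP_LIVE_FRAMES,
		.repeat = 64,	/* enough rounds for page_pool recycling to kick in */
	);

	return bpf_prog_test_run_opts(xdp_prog_fd, &opts);
}
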
4 changes: 4 additions & 0 deletions net/core/sock_map.c
@@ -1760,6 +1760,10 @@ static int sock_map_link_update_prog(struct bpf_link *link,
ret = -EINVAL;
goto out;
}
if (!sockmap_link->map) {
ret = -ENOLINK;
goto out;
}

ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
sockmap_link->attach_type);
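
The race being closed, sketched from userspace (outline only; the two calls have to hit the same link from different threads for sockmap_link->map to be cleared mid-update):

#include <bpf/bpf.h>

/* thread A: detach the sockmap link, which clears the link's map pointer */
void detach_thread(int link_fd)
{
	bpf_link_detach(link_fd);
}

/* thread B: concurrently swap the link's program; with this fix the update
 * fails with ENOLINK instead of dereferencing a NULL sockmap_link->map.
 */
void update_thread(int link_fd, int new_prog_fd)
{
	bpf_link_update(link_fd, new_prog_fd, NULL);
}
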
7 changes: 4 additions & 3 deletions net/ipv4/tcp_bpf.c
@@ -221,11 +221,11 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
int flags,
int *addr_len)
{
- struct tcp_sock *tcp = tcp_sk(sk);
int peek = flags & MSG_PEEK;
- u32 seq = tcp->copied_seq;
struct sk_psock *psock;
+ struct tcp_sock *tcp;
int copied = 0;
+ u32 seq;

if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
@@ -238,7 +238,8 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
return tcp_recvmsg(sk, msg, len, flags, addr_len);

lock_sock(sk);

tcp = tcp_sk(sk);
seq = tcp->copied_seq;
/* We may have received data on the sk_receive_queue pre-accept and
* then we can not use read_skb in this context because we haven't
* assigned a sk_socket yet so have no link to the ops. The work-around
109 changes: 109 additions & 0 deletions tools/testing/selftests/bpf/map_tests/lpm_trie_map_get_next_key.c
@@ -0,0 +1,109 @@
// SPDX-License-Identifier: GPL-2.0

#define _GNU_SOURCE
#include <linux/bpf.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

#include <test_maps.h>

struct test_lpm_key {
__u32 prefix;
__u32 data;
};

struct get_next_key_ctx {
struct test_lpm_key key;
bool start;
bool stop;
int map_fd;
int loop;
};

static void *get_next_key_fn(void *arg)
{
struct get_next_key_ctx *ctx = arg;
struct test_lpm_key next_key;
int i = 0;

while (!ctx->start)
usleep(1);

while (!ctx->stop && i++ < ctx->loop)
bpf_map_get_next_key(ctx->map_fd, &ctx->key, &next_key);

return NULL;
}

static void abort_get_next_key(struct get_next_key_ctx *ctx, pthread_t *tids,
unsigned int nr)
{
unsigned int i;

ctx->stop = true;
ctx->start = true;
for (i = 0; i < nr; i++)
pthread_join(tids[i], NULL);
}

/* This test aims to prevent future regressions. As long as the kernel does
* not panic, it is considered a success.
*/
void test_lpm_trie_map_get_next_key(void)
{
#define MAX_NR_THREADS 8
LIBBPF_OPTS(bpf_map_create_opts, create_opts,
.map_flags = BPF_F_NO_PREALLOC);
struct test_lpm_key key = {};
__u32 val = 0;
int map_fd;
const __u32 max_prefixlen = 8 * (sizeof(key) - sizeof(key.prefix));
const __u32 max_entries = max_prefixlen + 1;
unsigned int i, nr = MAX_NR_THREADS, loop = 65536;
pthread_t tids[MAX_NR_THREADS];
struct get_next_key_ctx ctx;
int err;

map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_trie_map",
sizeof(struct test_lpm_key), sizeof(__u32),
max_entries, &create_opts);
CHECK(map_fd == -1, "bpf_map_create()", "error:%s\n",
strerror(errno));

for (i = 0; i <= max_prefixlen; i++) {
key.prefix = i;
err = bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
CHECK(err, "bpf_map_update_elem()", "error:%s\n",
strerror(errno));
}

ctx.start = false;
ctx.stop = false;
ctx.map_fd = map_fd;
ctx.loop = loop;
memcpy(&ctx.key, &key, sizeof(key));

for (i = 0; i < nr; i++) {
err = pthread_create(&tids[i], NULL, get_next_key_fn, &ctx);
if (err) {
abort_get_next_key(&ctx, tids, i);
CHECK(err, "pthread_create", "error %d\n", err);
}
}

ctx.start = true;
for (i = 0; i < nr; i++)
pthread_join(tids[i], NULL);

printf("%s:PASS\n", __func__);

close(map_fd);
}

