Skip to content

Commit 68106ec

Browse files
liuhangbinkernel-patches-bot
authored and committed
xdp: add a new helper for dev map multicast support
This patch is for xdp multicast support, which has been discussed before[0]. The goal is to be able to implement an OVS-like data plane in XDP, i.e., a software switch that can forward XDP frames to multiple ports. To achieve this, an application needs to specify a group of interfaces to forward a packet to. It is also common to want to exclude one or more physical interfaces from the forwarding operation - e.g., to forward a packet to all interfaces in the multicast group except the interface it arrived on. While this could be done simply by adding more groups, this quickly leads to a combinatorial explosion in the number of groups an application has to maintain. To avoid the combinatorial explosion, we propose to include the ability to specify an "exclude group" as part of the forwarding operation. This needs to be a group (instead of just a single port index), because a physical interface can be part of a logical grouping, such as a bond device. Thus, the logical forwarding operation becomes a "set difference" operation, i.e. "forward to all ports in group A that are not also in group B". This series implements such an operation using device maps to represent the groups. This means that the XDP program specifies two device maps, one containing the list of netdevs to redirect to, and the other containing the exclude list. To achieve this, a new helper bpf_redirect_map_multi() is implemented to accept two maps, the forwarding map and exclude map. The forwarding map could be DEVMAP or DEVMAP_HASH, but the exclude map *must* be DEVMAP_HASH to get better performance. If users don't want to use an exclude map and simply want to stop redirecting back to the ingress device, they can use the flag BPF_F_EXCLUDE_INGRESS. As both bpf_xdp_redirect_map() and this new helper are using struct bpf_redirect_info, a new field ex_map is added and tgt_value is set to NULL in the new helper to distinguish it from bpf_xdp_redirect_map(). 
Lastly, keep the general data path in net/core/filter.c, the native data path in kernel/bpf/devmap.c so we can use direct calls to get better performance. [0] https://xdp-project.net/#Handling-multicast Acked-by: Toke Høiland-Jørgensen <toke@redhat.com> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
1 parent 5ea0b4a commit 68106ec

File tree

9 files changed

+357
-5
lines changed

9 files changed

+357
-5
lines changed

include/linux/bpf.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,6 +1443,11 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
14431443
struct net_device *dev_rx);
14441444
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
14451445
struct net_device *dev_rx);
1446+
bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
1447+
int exclude_ifindex);
1448+
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
1449+
struct bpf_map *map, struct bpf_map *ex_map,
1450+
u32 flags);
14461451
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
14471452
struct bpf_prog *xdp_prog);
14481453
bool dev_map_can_have_prog(struct bpf_map *map);
@@ -1615,6 +1620,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
16151620
return 0;
16161621
}
16171622

1623+
static inline
1624+
bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
1625+
int exclude_ifindex)
1626+
{
1627+
return false;
1628+
}
1629+
1630+
static inline
1631+
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
1632+
struct bpf_map *map, struct bpf_map *ex_map,
1633+
u32 flags)
1634+
{
1635+
return 0;
1636+
}
1637+
16181638
struct sk_buff;
16191639

16201640
static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,

include/linux/filter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,7 @@ struct bpf_redirect_info {
647647
u32 tgt_index;
648648
void *tgt_value;
649649
struct bpf_map *map;
650+
struct bpf_map *ex_map;
650651
u32 kern_flags;
651652
struct bpf_nh_params nh;
652653
};

include/net/xdp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
170170
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
171171
struct net_device *dev);
172172
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
173+
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
173174

174175
static inline
175176
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)

include/uapi/linux/bpf.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3909,6 +3909,27 @@ union bpf_attr {
39093909
* * **BPF_MTU_CHK_RET_FRAG_NEEDED**
39103910
* * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
39113911
*
3912+
* long bpf_redirect_map_multi(struct bpf_map *map, struct bpf_map *ex_map, u64 flags)
3913+
* Description
3914+
* This is a multicast implementation for XDP redirect. It will
3915+
* redirect the packet to ALL the interfaces in *map*, but
3916+
* exclude the interfaces in *ex_map*.
3917+
*
3918+
* The forwarding *map* could be either BPF_MAP_TYPE_DEVMAP or
3919+
* BPF_MAP_TYPE_DEVMAP_HASH. To get better performance, the
3920+
* *ex_map* is limited to BPF_MAP_TYPE_DEVMAP_HASH and must be
3921+
* keyed by ifindex for the helper to work.
3922+
*
3923+
* Currently the *flags* only supports *BPF_F_EXCLUDE_INGRESS*,
3924+
* which additionally excludes the current ingress device.
3925+
*
3926+
* See also bpf_redirect_map() as a unicast implementation,
3927+
* which supports redirecting packet to a specific ifindex
3928+
* in the map. As both helpers use struct bpf_redirect_info
3929+
* to store the redirect info, we will use a NULL tgt_value
3930+
* to distinguish multicast and unicast redirecting.
3931+
* Return
3932+
* **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
39123933
*/
39133934
#define __BPF_FUNC_MAPPER(FN) \
39143935
FN(unspec), \
@@ -4075,6 +4096,7 @@ union bpf_attr {
40754096
FN(ima_inode_hash), \
40764097
FN(sock_from_file), \
40774098
FN(check_mtu), \
4099+
FN(redirect_map_multi), \
40784100
/* */
40794101

40804102
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4251,6 +4273,11 @@ enum {
42514273
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
42524274
};
42534275

4276+
/* BPF_FUNC_redirect_map_multi flags. */
4277+
enum {
4278+
BPF_F_EXCLUDE_INGRESS = (1ULL << 0),
4279+
};
4280+
42544281
#define __bpf_md_ptr(type, name) \
42554282
union { \
42564283
type name; \

kernel/bpf/devmap.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,133 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
519519
return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
520520
}
521521

522+
/* Use direct call in fast path instead of map->ops->map_get_next_key() */
523+
static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
524+
{
525+
switch (map->map_type) {
526+
case BPF_MAP_TYPE_DEVMAP:
527+
return dev_map_get_next_key(map, key, next_key);
528+
case BPF_MAP_TYPE_DEVMAP_HASH:
529+
return dev_map_hash_get_next_key(map, key, next_key);
530+
default:
531+
break;
532+
}
533+
534+
return -ENOENT;
535+
}
536+
537+
bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
538+
int exclude_ifindex)
539+
{
540+
if (obj->dev->ifindex == exclude_ifindex)
541+
return true;
542+
543+
if (!map)
544+
return false;
545+
546+
return __dev_map_hash_lookup_elem(map, obj->dev->ifindex) != NULL;
547+
}
548+
549+
/* Walk @map starting after @key (NULL means "from the beginning") and
 * return the first entry that is a valid forwarding target for @xdp:
 * present in the map, not excluded by @ex_map/@ex_ifindex (see
 * dev_in_exclude_map()), backed by a driver that implements
 * ndo_xdp_xmit, and able to accept a frame of this length
 * (xdp_ok_fwd_dev()).
 *
 * On success, *next_key holds the key of the returned entry so the
 * caller can resume the walk from it.  Returns NULL when no further
 * eligible entry exists.
 */
static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp, struct bpf_map *map,
						   struct bpf_map *ex_map, u32 *key,
						   u32 *next_key, int ex_ifindex)
{
	struct bpf_dtab_netdev *obj = NULL;
	struct net_device *dev;
	u32 *tmp_key = key;
	u32 index;
	int err;

	err = devmap_get_next_key(map, tmp_key, next_key);
	if (err)
		return NULL;

	/* When using dev map hash, we could restart the hashtab traversal
	 * in case the key has been updated/removed in the mean time.
	 * So we may end up potentially looping due to traversal restarts
	 * from first elem.
	 *
	 * Let's use map's max_entries to limit the loop number.
	 */
	for (index = 0; index < map->max_entries; index++) {
		switch (map->map_type) {
		case BPF_MAP_TYPE_DEVMAP:
			obj = __dev_map_lookup_elem(map, *next_key);
			break;
		case BPF_MAP_TYPE_DEVMAP_HASH:
			obj = __dev_map_hash_lookup_elem(map, *next_key);
			break;
		default:
			/* Unsupported map type: treat as "not found".
			 * The original code broke out without assigning
			 * obj, reading an indeterminate pointer below.
			 */
			obj = NULL;
			break;
		}

		if (!obj || dev_in_exclude_map(obj, ex_map, ex_ifindex))
			goto find_next;

		dev = obj->dev;

		/* Target must be able to transmit XDP frames at all... */
		if (!dev->netdev_ops->ndo_xdp_xmit)
			goto find_next;

		/* ...and this particular frame must fit its MTU. */
		err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
		if (unlikely(err))
			goto find_next;

		return obj;

find_next:
		tmp_key = next_key;
		err = devmap_get_next_key(map, tmp_key, next_key);
		if (err)
			break;
	}

	return NULL;
}
605+
606+
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
607+
struct bpf_map *map, struct bpf_map *ex_map,
608+
u32 flags)
609+
{
610+
struct bpf_dtab_netdev *obj = NULL, *next_obj = NULL;
611+
struct xdp_frame *xdpf, *nxdpf;
612+
int ex_ifindex;
613+
u32 key, next_key;
614+
615+
ex_ifindex = flags & BPF_F_EXCLUDE_INGRESS ? dev_rx->ifindex : 0;
616+
617+
/* Find first available obj */
618+
obj = devmap_get_next_obj(xdp, map, ex_map, NULL, &key, ex_ifindex);
619+
if (!obj)
620+
return 0;
621+
622+
xdpf = xdp_convert_buff_to_frame(xdp);
623+
if (unlikely(!xdpf))
624+
return -EOVERFLOW;
625+
626+
for (;;) {
627+
/* Check if we still have one more available obj */
628+
next_obj = devmap_get_next_obj(xdp, map, ex_map, &key,
629+
&next_key, ex_ifindex);
630+
if (!next_obj) {
631+
bq_enqueue(obj->dev, xdpf, dev_rx, obj->xdp_prog);
632+
return 0;
633+
}
634+
635+
nxdpf = xdpf_clone(xdpf);
636+
if (unlikely(!nxdpf)) {
637+
xdp_return_frame_rx_napi(xdpf);
638+
return -ENOMEM;
639+
}
640+
641+
bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
642+
643+
/* Deal with next obj */
644+
obj = next_obj;
645+
key = next_key;
646+
}
647+
}
648+
522649
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
523650
struct bpf_prog *xdp_prog)
524651
{

kernel/bpf/verifier.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4884,6 +4884,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
48844884
case BPF_MAP_TYPE_DEVMAP:
48854885
case BPF_MAP_TYPE_DEVMAP_HASH:
48864886
if (func_id != BPF_FUNC_redirect_map &&
4887+
func_id != BPF_FUNC_redirect_map_multi &&
48874888
func_id != BPF_FUNC_map_lookup_elem)
48884889
goto error;
48894890
break;
@@ -4988,6 +4989,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
49884989
map->map_type != BPF_MAP_TYPE_XSKMAP)
49894990
goto error;
49904991
break;
4992+
case BPF_FUNC_redirect_map_multi:
4993+
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
4994+
map->map_type != BPF_MAP_TYPE_DEVMAP_HASH)
4995+
goto error;
4996+
break;
49914997
case BPF_FUNC_sk_redirect_map:
49924998
case BPF_FUNC_msg_redirect_map:
49934999
case BPF_FUNC_sock_map_update:

0 commit comments

Comments
 (0)