Skip to content

Commit

Permalink
Merge branch 'vxlan-support-user-defined-reserved-bits'
Browse files Browse the repository at this point in the history
Petr Machata says:

====================
vxlan: Support user-defined reserved bits

Currently the VXLAN header validation works by vxlan_rcv() going feature
by feature, each feature clearing the bits that it consumes. If anything
is left unparsed at the end, the packet is rejected.

Unfortunately there are machines out there that send VXLAN packets with
reserved bits set, even if they are configured to not use the
corresponding features. One such report is here[1], and we have heard
similar complaints from our customers as well.

This patchset adds an attribute that makes it configurable which bits
the user wishes to tolerate and which they consider reserved. This was
recommended in [1] as well.

A knob like that inevitably allows users to set as reserved bits that
are in fact required for the features enabled by the netdevice, such as
GPE. This is detected, and such configurations are rejected.

In patches #1..#7, the reserved bits validation code is gradually moved
away from the unparsed approach described above, to one where a given
set of valid bits is precomputed and then the packet is validated
against that.

In patch #8, this precomputed set is made configurable through a new
attribute IFLA_VXLAN_RESERVED_BITS.

Patches #9 and #10 massage the testsuite a bit, so that patch #11 can
introduce a selftest for the resreved bits feature.

The corresponding iproute2 support is available in [2].

[1] https://lore.kernel.org/netdev/db8b9e19-ad75-44d3-bfb2-46590d426ff5@proxmox.com/
[2] https://github.com/pmachata/iproute2/commits/vxlan_reserved_bits/
====================

Link: https://patch.msgid.link/cover.1733412063.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
kuba-moo committed Dec 9, 2024
2 parents 3f330db + d84b5dc commit e58b477
Show file tree
Hide file tree
Showing 8 changed files with 497 additions and 62 deletions.
150 changes: 99 additions & 51 deletions drivers/net/vxlan/vxlan_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -622,9 +622,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
return 1;
}

static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol)
static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol)
{
struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr;
const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr;

/* Need to have Next Protocol set for interfaces in GPE mode. */
if (!gpe->np_applied)
Expand Down Expand Up @@ -1554,41 +1554,38 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
#endif
}

static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed,
struct sk_buff *skb,
u32 vxflags)
static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags)
{
const struct vxlanhdr *vh = vxlan_hdr(skb);
enum skb_drop_reason reason;
size_t start, offset;

if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
goto out;
if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
return SKB_NOT_DROPPED_YET;

start = vxlan_rco_start(unparsed->vx_vni);
offset = start + vxlan_rco_offset(unparsed->vx_vni);
start = vxlan_rco_start(vh->vx_vni);
offset = start + vxlan_rco_offset(vh->vx_vni);

reason = pskb_may_pull_reason(skb, offset + sizeof(u16));
if (reason)
return reason;

skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
!!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
out:
unparsed->vx_flags &= ~VXLAN_HF_RCO;
unparsed->vx_vni &= VXLAN_VNI_MASK;

return SKB_NOT_DROPPED_YET;
}

static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
struct sk_buff *skb, u32 vxflags,
static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags,
struct vxlan_metadata *md)
{
struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
const struct vxlanhdr *vh = vxlan_hdr(skb);
const struct vxlanhdr_gbp *gbp;
struct metadata_dst *tun_dst;

if (!(unparsed->vx_flags & VXLAN_HF_GBP))
goto out;
gbp = (const struct vxlanhdr_gbp *)vh;

if (!(vh->vx_flags & VXLAN_HF_GBP))
return;

md->gbp = ntohs(gbp->policy_id);

Expand All @@ -1607,8 +1604,6 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
/* In flow-based mode, GBP is carried in dst_metadata */
if (!(vxflags & VXLAN_F_COLLECT_METADATA))
skb->mark = md->gbp;
out:
unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan,
Expand Down Expand Up @@ -1672,9 +1667,9 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_vni_node *vninode = NULL;
const struct vxlanhdr *vh;
struct vxlan_dev *vxlan;
struct vxlan_sock *vs;
struct vxlanhdr unparsed;
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
__be16 protocol = htons(ETH_P_TEB);
Expand All @@ -1689,38 +1684,49 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
if (reason)
goto drop;

unparsed = *vxlan_hdr(skb);
vh = vxlan_hdr(skb);
/* VNI flag always required to be set */
if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
if (!(vh->vx_flags & VXLAN_HF_VNI)) {
netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
ntohl(vxlan_hdr(skb)->vx_flags),
ntohl(vxlan_hdr(skb)->vx_vni));
ntohl(vh->vx_flags), ntohl(vh->vx_vni));
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
/* Return non vxlan pkt */
goto drop;
}
unparsed.vx_flags &= ~VXLAN_HF_VNI;
unparsed.vx_vni &= ~VXLAN_VNI_MASK;

vs = rcu_dereference_sk_user_data(sk);
if (!vs)
goto drop;

vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
vni = vxlan_vni(vh->vx_vni);

vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
if (!vxlan) {
reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
goto drop;
}

/* For backwards compatibility, only allow reserved fields to be
* used by VXLAN extensions if explicitly requested.
*/
if (vs->flags & VXLAN_F_GPE) {
if (!vxlan_parse_gpe_proto(&unparsed, &protocol))
if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
/* If the header uses bits besides those enabled by the
* netdevice configuration, treat this as a malformed packet.
* This behavior diverges from VXLAN RFC (RFC7348) which
* stipulates that bits in reserved in reserved fields are to be
* ignored. The approach here maintains compatibility with
* previous stack code, and also is more robust and provides a
* little more security in adding extensions to VXLAN.
*/
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
DEV_STATS_INC(vxlan->dev, rx_frame_errors);
DEV_STATS_INC(vxlan->dev, rx_errors);
vxlan_vnifilter_count(vxlan, vni, vninode,
VXLAN_VNI_STATS_RX_ERRORS, 0);
goto drop;
}

if (vxlan->cfg.flags & VXLAN_F_GPE) {
if (!vxlan_parse_gpe_proto(vh, &protocol))
goto drop;
unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS;
raw_proto = true;
}

Expand All @@ -1730,8 +1736,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
goto drop;
}

if (vs->flags & VXLAN_F_REMCSUM_RX) {
reason = vxlan_remcsum(&unparsed, skb, vs->flags);
if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
reason = vxlan_remcsum(skb, vxlan->cfg.flags);
if (unlikely(reason))
goto drop;
}
Expand All @@ -1756,25 +1762,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
memset(md, 0, sizeof(*md));
}

if (vs->flags & VXLAN_F_GBP)
vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
if (vxlan->cfg.flags & VXLAN_F_GBP)
vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
/* Note that GBP and GPE can never be active together. This is
* ensured in vxlan_dev_configure.
*/

if (unparsed.vx_flags || unparsed.vx_vni) {
/* If there are any unprocessed flags remaining treat
* this as a malformed packet. This behavior diverges from
* VXLAN RFC (RFC7348) which stipulates that bits in reserved
* in reserved fields are to be ignored. The approach here
* maintains compatibility with previous stack code, and also
* is more robust and provides a little more security in
* adding extensions to VXLAN.
*/
reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
goto drop;
}

if (!raw_proto) {
reason = vxlan_set_mac(vxlan, vs, skb, vni);
if (reason)
Expand Down Expand Up @@ -3435,6 +3428,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 },
[IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1),
[IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
[IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
};

static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
Expand Down Expand Up @@ -4070,6 +4064,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
struct net_device *dev, struct vxlan_config *conf,
bool changelink, struct netlink_ext_ack *extack)
{
struct vxlanhdr used_bits = {
.vx_flags = VXLAN_HF_VNI,
.vx_vni = VXLAN_VNI_MASK,
};
struct vxlan_dev *vxlan = netdev_priv(dev);
int err = 0;

Expand Down Expand Up @@ -4296,13 +4294,16 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
extack);
if (err)
return err;
used_bits.vx_flags |= VXLAN_HF_RCO;
used_bits.vx_vni |= ~VXLAN_VNI_MASK;
}

if (data[IFLA_VXLAN_GBP]) {
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
VXLAN_F_GBP, changelink, false, extack);
if (err)
return err;
used_bits.vx_flags |= VXLAN_GBP_USED_BITS;
}

if (data[IFLA_VXLAN_GPE]) {
Expand All @@ -4311,6 +4312,46 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
extack);
if (err)
return err;

used_bits.vx_flags |= VXLAN_GPE_USED_BITS;
}

if (data[IFLA_VXLAN_RESERVED_BITS]) {
struct vxlanhdr reserved_bits;

if (changelink) {
NL_SET_ERR_MSG_ATTR(extack,
data[IFLA_VXLAN_RESERVED_BITS],
"Cannot change reserved_bits");
return -EOPNOTSUPP;
}

nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS],
sizeof(reserved_bits));
if (used_bits.vx_flags & reserved_bits.vx_flags ||
used_bits.vx_vni & reserved_bits.vx_vni) {
__be64 ub_be64, rb_be64;

memcpy(&ub_be64, &used_bits, sizeof(ub_be64));
memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64));

NL_SET_ERR_MSG_ATTR_FMT(extack,
data[IFLA_VXLAN_RESERVED_BITS],
"Used bits %#018llx cannot overlap reserved bits %#018llx",
be64_to_cpu(ub_be64),
be64_to_cpu(rb_be64));
return -EINVAL;
}

conf->reserved_bits = reserved_bits;
} else {
/* For backwards compatibility, only allow reserved fields to be
* used by VXLAN extensions if explicitly requested.
*/
conf->reserved_bits = (struct vxlanhdr) {
.vx_flags = ~used_bits.vx_flags,
.vx_vni = ~used_bits.vx_vni,
};
}

if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
Expand Down Expand Up @@ -4497,6 +4538,8 @@ static size_t vxlan_get_size(const struct net_device *dev)
nla_total_size(0) + /* IFLA_VXLAN_GPE */
nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */
/* IFLA_VXLAN_RESERVED_BITS */
nla_total_size(sizeof(struct vxlanhdr)) +
0;
}

Expand Down Expand Up @@ -4599,6 +4642,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
!!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
goto nla_put_failure;

if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS,
sizeof(vxlan->cfg.reserved_bits),
&vxlan->cfg.reserved_bits))
goto nla_put_failure;

return 0;

nla_put_failure:
Expand Down
1 change: 1 addition & 0 deletions include/net/vxlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ struct vxlan_config {
unsigned int addrmax;
bool no_share;
enum ifla_vxlan_df df;
struct vxlanhdr reserved_bits;
};

enum {
Expand Down
1 change: 1 addition & 0 deletions include/uapi/linux/if_link.h
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,7 @@ enum {
IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
IFLA_VXLAN_LOCALBYPASS,
IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
IFLA_VXLAN_RESERVED_BITS,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
Expand Down
6 changes: 3 additions & 3 deletions tools/testing/selftests/net/fdb_notify.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ test_dup_vxlan_self()
{
ip_link_add br up type bridge vlan_filtering 1
ip_link_add vx up type vxlan id 2000 dstport 4789
ip_link_master vx br
ip_link_set_master vx br

do_test_dup add "vxlan" dev vx self dst 192.0.2.1
do_test_dup del "vxlan" dev vx self dst 192.0.2.1
Expand All @@ -59,7 +59,7 @@ test_dup_vxlan_master()
{
ip_link_add br up type bridge vlan_filtering 1
ip_link_add vx up type vxlan id 2000 dstport 4789
ip_link_master vx br
ip_link_set_master vx br

do_test_dup add "vxlan master" dev vx master
do_test_dup del "vxlan master" dev vx master
Expand All @@ -79,7 +79,7 @@ test_dup_macvlan_master()
ip_link_add br up type bridge vlan_filtering 1
ip_link_add dd up type dummy
ip_link_add mv up link dd type macvlan mode passthru
ip_link_master mv br
ip_link_set_master mv br

do_test_dup add "macvlan master" dev mv self
do_test_dup del "macvlan master" dev mv self
Expand Down
1 change: 1 addition & 0 deletions tools/testing/selftests/net/forwarding/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \
vxlan_bridge_1q_port_8472_ipv6.sh \
vxlan_bridge_1q_port_8472.sh \
vxlan_bridge_1q.sh \
vxlan_reserved.sh \
vxlan_symmetric_ipv6.sh \
vxlan_symmetric.sh

Expand Down
7 changes: 0 additions & 7 deletions tools/testing/selftests/net/forwarding/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -932,13 +932,6 @@ packets_rate()
echo $(((t1 - t0) / interval))
}

mac_get()
{
local if_name=$1

ip -j link show dev $if_name | jq -r '.[]["address"]'
}

ether_addr_to_u64()
{
local addr="$1"
Expand Down
Loading

0 comments on commit e58b477

Please sign in to comment.