Skip to content

Commit

Permalink
net: Add a bhash2 table hashed by port and address
Browse files Browse the repository at this point in the history
The current bind hashtable (bhash) is hashed by port only.
In the socket bind path, we have to check for bind conflicts by
traversing the specified port's inet_bind_bucket while holding the
hashbucket's spinlock (see inet_csk_get_port() and
inet_csk_bind_conflict()). In instances where there are tons of
sockets hashed to the same port at different addresses, the bind
conflict check is time-intensive and can cause softirq cpu lockups,
as well as stops new tcp connections since __inet_inherit_port()
also contests for the spinlock.

This patch adds a second bind table, bhash2, that hashes by
port and sk->sk_rcv_saddr (ipv4) and sk->sk_v6_rcv_saddr (ipv6).
Searching the bhash2 table leads to significantly faster conflict
resolution and less time holding the hashbucket spinlock.

Please note a few things:
* There can be the case where the a socket's address changes after it
has been bound. There are two cases where this happens:

  1) The case where there is a bind() call on INADDR_ANY (ipv4) or
  IPV6_ADDR_ANY (ipv6) and then a connect() call. The kernel will
  assign the socket an address when it handles the connect()

  2) In inet_sk_reselect_saddr(), which is called when rebuilding the
  sk header and a few pre-conditions are met (eg rerouting fails).

In these two cases, we need to update the bhash2 table by removing the
entry for the old address, and add a new entry reflecting the updated
address.

* The bhash2 table must have its own lock, even though concurrent
accesses on the same port are protected by the bhash lock. Bhash2 must
have its own lock to protect against cases where sockets on different
ports hash to different bhash hashbuckets but to the same bhash2
hashbucket.

This brings up a few stipulations:
  1) When acquiring both the bhash and the bhash2 lock, the bhash2 lock
  will always be acquired after the bhash lock and released before the
  bhash lock is released.

  2) There are no nested bhash2 hashbucket locks. A bhash2 lock is always
  acquired+released before another bhash2 lock is acquired+released.

* The bhash table cannot be superseded by the bhash2 table because for
bind requests on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6), every socket
bound to that port must be checked for a potential conflict. The bhash
table is the only source of port->socket associations.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
joannekoong authored and kuba-moo committed Aug 25, 2022
1 parent 0bf7325 commit 28044fc
Show file tree
Hide file tree
Showing 12 changed files with 700 additions and 94 deletions.
3 changes: 3 additions & 0 deletions include/net/inet_connection_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#undef INET_CSK_CLEAR_TIMERS

struct inet_bind_bucket;
struct inet_bind2_bucket;
struct tcp_congestion_ops;

/*
Expand Down Expand Up @@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
*
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
* @icsk_bind2_hash: Bind node in the bhash2 table
* @icsk_timeout: Timeout
* @icsk_retransmit_timer: Resend (no ack)
* @icsk_rto: Retransmit timeout
Expand All @@ -83,6 +85,7 @@ struct inet_connection_sock {
struct inet_sock icsk_inet;
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
struct inet_bind2_bucket *icsk_bind2_hash;
unsigned long icsk_timeout;
struct timer_list icsk_retransmit_timer;
struct timer_list icsk_delack_timer;
Expand Down
80 changes: 78 additions & 2 deletions include/net/inet_hashtables.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
Expand Down Expand Up @@ -90,7 +91,28 @@ struct inet_bind_bucket {
struct hlist_head owners;
};

static inline struct net *ib_net(struct inet_bind_bucket *ib)
struct inet_bind2_bucket {
possible_net_t ib_net;
int l3mdev;
unsigned short port;
union {
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr v6_rcv_saddr;
#endif
__be32 rcv_saddr;
};
/* Node in the bhash2 inet_bind_hashbucket chain */
struct hlist_node node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
};

static inline struct net *ib_net(const struct inet_bind_bucket *ib)
{
return read_pnet(&ib->ib_net);
}

static inline struct net *ib2_net(const struct inet_bind2_bucket *ib)
{
return read_pnet(&ib->ib_net);
}
Expand Down Expand Up @@ -133,7 +155,14 @@ struct inet_hashinfo {
* TCP hash as well as the others for fast bind/connect.
*/
struct kmem_cache *bind_bucket_cachep;
/* This bind table is hashed by local port */
struct inet_bind_hashbucket *bhash;
struct kmem_cache *bind2_bucket_cachep;
/* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4)
* or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used
* primarily for expediting bind conflict resolution.
*/
struct inet_bind_hashbucket *bhash2;
unsigned int bhash_size;

/* The 2nd listener table hashed by local port and address */
Expand Down Expand Up @@ -182,14 +211,61 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind_bucket *tb);

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
const struct net *net, unsigned short port,
int l3mdev);

struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
unsigned short port, int l3mdev,
const struct sock *sk);

void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
struct inet_bind2_bucket *tb);

struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head,
const struct net *net,
unsigned short port, int l3mdev,
const struct sock *sk);

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb,
const struct net *net, unsigned short port,
int l3mdev, const struct sock *sk);

static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size)
{
return (lport + net_hash_mix(net)) & (bhash_size - 1);
}

static inline struct inet_bind_hashbucket *
inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
const struct net *net, unsigned short port)
{
u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == AF_INET6)
hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
else
#endif
hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);

/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or
* sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's
* rcv_saddr field should already have been updated when this is called.
*/
int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk);

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
const unsigned short snum);
struct inet_bind2_bucket *tb2, unsigned short port);

/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);
Expand Down
14 changes: 14 additions & 0 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
Expand Down Expand Up @@ -537,6 +538,7 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
};

enum sk_pacing {
Expand Down Expand Up @@ -870,6 +872,16 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list);
}

static inline void __sk_del_bind2_node(struct sock *sk)
{
__hlist_del(&sk->sk_bind2_node);
}

static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
{
hlist_add_head(&sk->sk_bind2_node, list);
}

#define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
Expand All @@ -887,6 +899,8 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_bhash2(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind2_node)

/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
Expand Down
25 changes: 23 additions & 2 deletions net/dccp/ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,11 @@ static unsigned int dccp_v4_pernet_id __read_mostly;
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
__be32 daddr, nexthop, prev_sk_rcv_saddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
Expand Down Expand Up @@ -89,9 +90,29 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (inet_opt == NULL || !inet_opt->opt.srr)
daddr = fl4->daddr;

if (inet->inet_saddr == 0)
if (inet->inet_saddr == 0) {
if (inet_csk(sk)->icsk_bind2_hash) {
prev_addr_hashbucket =
inet_bhashfn_portaddr(&dccp_hashinfo, sk,
sock_net(sk),
inet->inet_num);
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
}
inet->inet_saddr = fl4->saddr;
}

sk_rcv_saddr_set(sk, inet->inet_saddr);

if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
inet->inet_saddr = 0;
sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
ip_rt_put(rt);
return err;
}
}

inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);

Expand Down
18 changes: 18 additions & 0 deletions net/dccp/ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -934,8 +934,26 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
}

if (saddr == NULL) {
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
struct in6_addr prev_v6_rcv_saddr;

if (icsk->icsk_bind2_hash) {
prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
sk, sock_net(sk),
inet->inet_num);
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
}

saddr = &fl6.saddr;
sk->sk_v6_rcv_saddr = *saddr;

if (prev_addr_hashbucket) {
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
goto failure;
}
}
}

/* set the source address */
Expand Down
34 changes: 29 additions & 5 deletions net/dccp/proto.c
Original file line number Diff line number Diff line change
Expand Up @@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind_bucket_cachep)
goto out_free_hashinfo2;
dccp_hashinfo.bind2_bucket_cachep =
kmem_cache_create("dccp_bind2_bucket",
sizeof(struct inet_bind2_bucket), 0,
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind2_bucket_cachep)
goto out_free_bind_bucket_cachep;

/*
* Size and allocate the main established and bind bucket
Expand Down Expand Up @@ -1150,7 +1156,7 @@ static int __init dccp_init(void)

if (!dccp_hashinfo.ehash) {
DCCP_CRIT("Failed to allocate DCCP established hash table");
goto out_free_bind_bucket_cachep;
goto out_free_bind2_bucket_cachep;
}

for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
Expand All @@ -1176,14 +1182,24 @@ static int __init dccp_init(void)
goto out_free_dccp_locks;
}

dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);

if (!dccp_hashinfo.bhash2) {
DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
goto out_free_dccp_bhash;
}

for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}

rc = dccp_mib_init();
if (rc)
goto out_free_dccp_bhash;
goto out_free_dccp_bhash2;

rc = dccp_ackvec_init();
if (rc)
Expand All @@ -1207,30 +1223,38 @@ static int __init dccp_init(void)
dccp_ackvec_exit();
out_free_dccp_mib:
dccp_mib_exit();
out_free_dccp_bhash2:
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind2_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2:
inet_hashinfo2_free_mod(&dccp_hashinfo);
out_fail:
dccp_hashinfo.bhash = NULL;
dccp_hashinfo.bhash2 = NULL;
dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL;
dccp_hashinfo.bind2_bucket_cachep = NULL;
return rc;
}

static void __exit dccp_fini(void)
{
int bhash_order = get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket));

ccid_cleanup_builtins();
dccp_mib_exit();
free_pages((unsigned long)dccp_hashinfo.bhash,
get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket)));
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
free_pages((unsigned long)dccp_hashinfo.ehash,
get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket)));
Expand Down
26 changes: 21 additions & 5 deletions net/ipv4/af_inet.c
Original file line number Diff line number Diff line change
Expand Up @@ -1219,13 +1219,15 @@ EXPORT_SYMBOL(inet_unregister_protosw);

static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_bind_hashbucket *prev_addr_hashbucket;
struct inet_sock *inet = inet_sk(sk);
__be32 old_saddr = inet->inet_saddr;
__be32 daddr = inet->inet_daddr;
struct flowi4 *fl4;
struct rtable *rt;
__be32 new_saddr;
struct ip_options_rcu *inet_opt;
int err;

inet_opt = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
Expand All @@ -1240,20 +1242,34 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (IS_ERR(rt))
return PTR_ERR(rt);

sk_setup_caps(sk, &rt->dst);

new_saddr = fl4->saddr;

if (new_saddr == old_saddr)
if (new_saddr == old_saddr) {
sk_setup_caps(sk, &rt->dst);
return 0;
}

prev_addr_hashbucket =
inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk,
sock_net(sk), inet->inet_num);

inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;

err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
if (err) {
inet->inet_saddr = old_saddr;
inet->inet_rcv_saddr = old_saddr;
ip_rt_put(rt);
return err;
}

sk_setup_caps(sk, &rt->dst);

if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}

inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;

/*
* XXX The only one ugly spot where we need to
* XXX really change the sockets identity after
Expand Down
Loading

0 comments on commit 28044fc

Please sign in to comment.