From 509e31c81fec3e544e4985bb2961020d6835967b Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sun, 19 Jun 2022 22:30:27 +0300 Subject: [PATCH 01/26] Convert errors handling in tfw_sock_clnt_init() to more traditional way --- fw/sock_clnt.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 7c6d7b0a4..6cafc839d 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -794,29 +794,27 @@ tfw_sock_clnt_init(void) * Check that flags for SS layer and Connection * layer are not overlapping. */ - BUILD_BUG_ON(Conn_Stop & (Conn_Clnt | - Conn_Srv | - TFW_FSM_HTTP | - TFW_FSM_HTTPS)); + BUILD_BUG_ON(Conn_Stop & (Conn_Clnt | Conn_Srv + | TFW_FSM_HTTP | TFW_FSM_HTTPS)); BUG_ON(tfw_cli_conn_cache); BUG_ON(tfw_h2_conn_cache); tfw_cli_conn_cache = kmem_cache_create("tfw_cli_conn_cache", sizeof(TfwCliConn), 0, 0, NULL); + if (!tfw_cli_conn_cache) + return -ENOMEM; + tfw_h2_conn_cache = kmem_cache_create("tfw_h2_conn_cache", sizeof(TfwH2Conn), 0, 0, NULL); - if (tfw_cli_conn_cache && tfw_h2_conn_cache) { - tfw_mod_register(&tfw_sock_clnt_mod); - return 0; + if (!tfw_h2_conn_cache) { + kmem_cache_destroy(tfw_cli_conn_cache); + return -ENOMEM; } - if (tfw_cli_conn_cache) - kmem_cache_destroy(tfw_cli_conn_cache); - if (tfw_h2_conn_cache) - kmem_cache_destroy(tfw_h2_conn_cache); + tfw_mod_register(&tfw_sock_clnt_mod); - return -ENOMEM; + return 0; } void From 195873312a70aa7c646599cce1758678857316f0 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Mon, 20 Jun 2022 06:07:47 +0300 Subject: [PATCH 02/26] Store TFW_FSM_H2 in SsProto->type to separate HTTP/2 listening and established sockets from HTTPS. Now we can use more efficient TFW_FSM_TYPE(conn->proto.type) == TFW_FSM_H2 instead of TFW_CONN_H2(conn) and initialize HTTP/2 connection descriptor only for HTTP/2 connections, not HTTPS as previously. 
Add a new alpn_match_cb hook for TLS, which validates the client ALPN matching one of our ALPNs against the type of the socket, so if a client sends the ALPN "h2" on a TCP connection established on a socket configured with "listen ... proto=https;", then the connection won't be established. --- fw/connection.h | 9 ++++----- fw/gfsm.h | 18 +++++++++++++++--- fw/http.c | 15 ++++++++------- fw/sock.c | 7 ------- fw/sock_clnt.c | 40 ++++++++++++++++++++++++---------------- fw/sync_socket.h | 7 ++++++- fw/tls.c | 38 ++++++++++++++++++++++++++++---------- tls/tls_srv.c | 5 ++++- tls/ttls.c | 5 ++++- tls/ttls.h | 1 + 10 files changed, 94 insertions(+), 51 deletions(-) diff --git a/fw/connection.h b/fw/connection.h index a169a2214..23ffc2c4c 100644 --- a/fw/connection.h +++ b/fw/connection.h @@ -52,6 +52,10 @@ enum { Conn_HttpsClnt = Conn_Clnt | TFW_FSM_HTTPS, Conn_HttpsSrv = Conn_Srv | TFW_FSM_HTTPS, + /* HTTP/2 */ + Conn_H2Clnt = Conn_Clnt | TFW_FSM_H2, + Conn_H2Srv = Conn_Srv | TFW_FSM_H2, + /* Websocket plain */ Conn_WsClnt = Conn_HttpClnt | TFW_FSM_WEBSOCKET, Conn_WsSrv = Conn_HttpSrv | TFW_FSM_WEBSOCKET, @@ -235,11 +239,6 @@ typedef struct { #define tfw_h2_context(conn) ((TfwH2Ctx *)(&((TfwH2Conn *)conn)->h2)) -#define TFW_CONN_H2(c) \ - (TFW_CONN_TLS((TfwConn *)c) \ - && tfw_tls_context(c)->alpn_chosen \ - && tfw_tls_context(c)->alpn_chosen->id == TTLS_ALPN_ID_HTTP2) - /* Callbacks used by l5-l7 protocols to operate on connection level. */ typedef struct { /* diff --git a/fw/gfsm.h b/fw/gfsm.h index 027e4fcf8..0c3cc4454 100644 --- a/fw/gfsm.h +++ b/fw/gfsm.h @@ -79,13 +79,25 @@ * build real stack. This simplifies the protocols handling, makes it faster * and provides more flexibility to set classification FSMs' hooks for * specific secured application protocol. + * + * The constants are used in the less significant byte for connection or socket + * type, see also enums in connection.h and sync_socket.h. 
+ * + * TODO #77: probably we should get rid of the FRANG states, call the http + * limiting callbacks directly and make the enum purely for the protocol + * type. Then the enum in connection.h determines server/client connection + * and from sync_socket.h the state of the connection. Now this enum looks + * inconsistent. */ enum { /* Protocols */ - TFW_FSM_HTTP, - TFW_FSM_HTTPS, + TFW_FSM_HTTP = 0, + TFW_FSM_HTTPS = 1, + /* h2c isn't supported, so HTTP/2 is always HTTPS */ + TFW_FSM_H2 = 2 | TFW_FSM_HTTPS, + /* Not really a FSM, used for connection hook registration only */ - TFW_FSM_WEBSOCKET, + TFW_FSM_WEBSOCKET = 4, TFW_FSM_WS = TFW_FSM_WEBSOCKET | TFW_FSM_HTTP, TFW_FSM_WSS = TFW_FSM_WEBSOCKET | TFW_FSM_HTTPS, diff --git a/fw/http.c b/fw/http.c index 92cdec27b..99ec338c6 100644 --- a/fw/http.c +++ b/fw/http.c @@ -2538,7 +2538,7 @@ tfw_http_conn_msg_alloc(TfwConn *conn, TfwStream *stream) else tfw_http_init_parser_resp((TfwHttpResp *)hm); - if (TFW_CONN_H2(conn)) { + if (TFW_FSM_TYPE(conn->proto.type) == TFW_FSM_H2) { TfwHttpReq *req = (TfwHttpReq *)hm; if(!(req->pit.pool = __tfw_pool_new(0))) @@ -2739,7 +2739,7 @@ static void tfw_http_resp_terminate(TfwHttpMsg *hm); static void tfw_http_conn_drop(TfwConn *conn) { - bool h2_mode = TFW_CONN_H2(conn); + bool h2_mode = TFW_FSM_TYPE(conn->proto.type) == TFW_FSM_H2; T_DBG2("%s: conn=[%p]\n", __func__, conn); @@ -6545,14 +6545,15 @@ tfw_http_msg_process_generic(TfwConn *conn, TfwStream *stream, int tfw_http_msg_process(TfwConn *conn, struct sk_buff *skb) { - int r; TfwStream *stream = &((TfwConn *)conn)->stream; - r = TFW_CONN_H2(conn) - ? 
tfw_h2_frame_process(conn, skb) - : tfw_http_msg_process_generic(conn, stream, skb); + WARN_ON_ONCE(tfw_tls_context(c)->alpn_chosen + && tfw_tls_context(c)->alpn_chosen->id == TTLS_ALPN_ID_HTTP2 + && TFW_FSM_TYPE(conn->proto.type) != TFW_FSM_H2); - return r; + if (TFW_FSM_TYPE(conn->proto.type) == TFW_FSM_H2) + return tfw_h2_frame_process(conn, skb); + return tfw_http_msg_process_generic(conn, stream, skb); } /** diff --git a/fw/sock.c b/fw/sock.c index a6f89c755..fa9311ff5 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -1077,13 +1077,6 @@ ss_proto_init(SsProto *proto, const SsHooks *hooks, int type) } EXPORT_SYMBOL(ss_proto_init); -void -ss_proto_inherit(const SsProto *parent, SsProto *child, int child_type) -{ - *child = *parent; - child->type |= child_type; -} - /** * Make data socket serviced by synchronous sockets. * diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 6cafc839d..da7e58cef 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -190,7 +190,8 @@ tfw_sock_clnt_new(struct sock *sk) goto err_client; } - ss_proto_inherit(listen_sock_proto, &conn->proto, Conn_Clnt); + ss_proto_inherit(listen_sock_proto, &conn->proto); + BUG_ON(!(conn->proto.type & Conn_Clnt)); conn->destructor = (void *)tfw_cli_conn_release; @@ -382,10 +383,23 @@ static int tfw_listen_sock_add(const TfwAddr *addr, int type) { TfwListenSock *ls; - - /* Check for supported types */ - if (!(type == TFW_FSM_HTTP || type == TFW_FSM_HTTPS)) + const SsHooks *shooks; + + switch (type) { + case TFW_FSM_HTTP: + shooks = &tfw_sock_http_clnt_ss_hooks; + break; + case TFW_FSM_HTTPS: + case TFW_FSM_h2: + /* + * We call the same TLS hooks before generic HTTP processing + * for both the HTTP/1 and HTTP/2. + */ + shooks = &tfw_sock_tls_clnt_ss_hooks; + break; + default: return -EINVAL; + } /* Is there such an address on the list already? 
*/ list_for_each_entry(ls, &tfw_listen_socks_reconf, list) { @@ -400,13 +414,7 @@ tfw_listen_sock_add(const TfwAddr *addr, int type) if (!ls) return -ENOMEM; - if (type == TFW_FSM_HTTP) - ss_proto_init(&ls->proto, &tfw_sock_http_clnt_ss_hooks, - Conn_HttpClnt); - else if (type == TFW_FSM_HTTPS) - ss_proto_init(&ls->proto, &tfw_sock_tls_clnt_ss_hooks, - Conn_HttpsClnt); - + ss_proto_init(&ls->proto, shooks, Conn_Clnt | type); list_add(&ls->list, &tfw_listen_socks_reconf); ls->addr = *addr; @@ -508,8 +516,7 @@ tfw_sock_check_lst(TfwServer *srv) static int tfw_cfgop_listen(TfwCfgSpec *cs, TfwCfgEntry *ce) { - int r, type = TFW_FSM_HTTP; - int port; + int r, port, type = TFW_FSM_HTTP; TfwAddr addr; const char *in_str = NULL; @@ -541,6 +548,7 @@ tfw_cfgop_listen(TfwCfgSpec *cs, TfwCfgEntry *ce) if (r) goto parse_err; + /* Plain HTTP/1 is the default listening socket. */ if (!ce->attr_n) goto done; @@ -551,8 +559,8 @@ tfw_cfgop_listen(TfwCfgSpec *cs, TfwCfgEntry *ce) if (!strcasecmp(in_str, "http")) goto done; - type = TFW_FSM_HTTPS; - if (!tfw_tls_cfg_alpn_protos(in_str)) + type = tfw_tls_cfg_alpn_protos(in_str); + if (type > 0) goto done; parse_err: @@ -561,7 +569,7 @@ tfw_cfgop_listen(TfwCfgSpec *cs, TfwCfgEntry *ce) return -EINVAL; done: - if (type == TFW_FSM_HTTPS) + if (type & TFW_FSM_HTTPS) tfw_tls_cfg_require(); return tfw_listen_sock_add(&addr, type); } diff --git a/fw/sync_socket.h b/fw/sync_socket.h index 5b4156fd1..406bf0e55 100644 --- a/fw/sync_socket.h +++ b/fw/sync_socket.h @@ -105,6 +105,12 @@ ss_sock_live(struct sock *sk) return sk->sk_state == TCP_ESTABLISHED; } +static inline void +ss_proto_inherit(const SsProto *parent, SsProto *child) +{ + *child = *parent; +} + /* Synchronous operation required. */ #define SS_F_SYNC 0x01 /* Keep SKBs (use clones) on sending. 
*/ @@ -122,7 +128,6 @@ int ss_hooks_register(SsHooks* hooks); void ss_hooks_unregister(SsHooks* hooks); void ss_proto_init(SsProto *proto, const SsHooks *hooks, int type); -void ss_proto_inherit(const SsProto *parent, SsProto *child, int child_type); void ss_set_callbacks(struct sock *sk); void ss_set_listen(struct sock *sk); int ss_send(struct sock *sk, struct sk_buff **skb_head, int flags); diff --git a/fw/tls.c b/fw/tls.c index c2edd9767..4d2d81f78 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -655,11 +655,12 @@ static int tfw_tls_conn_init(TfwConn *c) { int r; - TlsCtx *tls = tfw_tls_context(c); - TfwH2Ctx *h2 = tfw_h2_context(c); + TlsCtx *tls; T_DBG2("%s: conn=[%p]\n", __func__, c); + BUG_ON(!(c->proto.type & TFW_FSM_HTTPS)); + tls = tfw_tls_context(c); if ((r = ttls_ctx_init(tls, &tfw_tls.cfg))) { T_ERR("TLS (%pK) setup failed (%x)\n", tls, -r); return -EINVAL; @@ -668,8 +669,9 @@ tfw_tls_conn_init(TfwConn *c) if (tfw_conn_hook_call(TFW_FSM_HTTP, c, conn_init)) return -EINVAL; - if ((r = tfw_h2_context_init(h2))) - return r; + if (TFW_FSM_TYPE(c->proto.type) == TFW_FSM_H2) + if ((r = tfw_h2_context_init(tfw_h2_context(c)))) + return r; /* * We never hook TLS connections in GFSM, but initialize it with 0 state @@ -905,6 +907,22 @@ ttls_cli_id(TlsCtx *tls, unsigned long hash) sizeof(TfwAddr), hash); } +bool +tfw_tls_alpn_match(TlsCtx *tls, ttls_alpn_proto *alpn) +{ + int sk_proto = ((SsProto *)tls->sk->sk_user_data)->type; + + if (TFW_FSM_TYPE(sk_proto) == TFW_FSM_H2 + && alpn->id == TTLS_ALPN_ID_HTTP2) + return true; + + if (TFW_FSM_TYPE(sk_proto) == TFW_FSM_HTTPS + && alpn->id == TTLS_ALPN_ID_HTTP1) + return true; + + return false; +} + /* * ------------------------------------------------------------------------ * TLS library configuration. @@ -977,7 +995,7 @@ tfw_tls_cfg_alpn_protos(const char *cfg_str) /* Prefer HTTP/2 over HTTP/1. 
*/ switch (proto0->id) { case TTLS_ALPN_ID_HTTP2: - return 0; + return TFW_FSM_H2; case TTLS_ALPN_ID_HTTP1: *proto1 = *proto0; fallthrough; @@ -985,7 +1003,7 @@ tfw_tls_cfg_alpn_protos(const char *cfg_str) proto0->id = TTLS_ALPN_ID_HTTP2; proto0->name = TTLS_ALPN_HTTP2; proto0->len = sizeof(TTLS_ALPN_HTTP2) - 1; - return 0; + return TFW_FSM_H2; } } @@ -995,14 +1013,14 @@ tfw_tls_cfg_alpn_protos(const char *cfg_str) proto1->id = TTLS_ALPN_ID_HTTP1; proto1->name = TTLS_ALPN_HTTP1; proto1->len = sizeof(TTLS_ALPN_HTTP1) - 1; - return 0; + return TFW_FSM_HTTPS; case TTLS_ALPN_ID_HTTP1: - return 0; + return TFW_FSM_HTTPS; case 0: proto0->id = TTLS_ALPN_ID_HTTP1; proto0->name = TTLS_ALPN_HTTP1; proto0->len = sizeof(TTLS_ALPN_HTTP1) - 1; - return 0; + return TFW_FSM_HTTPS; } } @@ -1072,7 +1090,7 @@ tfw_tls_init(void) return -EINVAL; ttls_register_callbacks(tfw_tls_send, tfw_tls_sni, frang_tls_handler, - ttls_cli_id); + ttls_cli_id, tfw_tls_alpn_match); if ((r = tfw_h2_init())) goto err_h2; diff --git a/tls/tls_srv.c b/tls/tls_srv.c index 986d46bb7..f30366900 100644 --- a/tls/tls_srv.c +++ b/tls/tls_srv.c @@ -32,6 +32,7 @@ ttls_sni_cb_t *ttls_sni_cb; ttls_hs_over_cb_t *ttls_hs_over_cb; +ttls_alpn_match_t *alpn_match_cb; static int ttls_parse_servername_ext(TlsCtx *tls, const unsigned char *buf, size_t len) @@ -379,7 +380,9 @@ ttls_parse_alpn_ext(TlsCtx *tls, const unsigned char *buf, size_t len) our = &alpn_list[i]; for (theirs = start; theirs != end; theirs += cur_len) { cur_len = *theirs++; - if (ttls_alpn_ext_eq(our, theirs, cur_len)) { + if (ttls_alpn_ext_eq(our, theirs, cur_len) + && alpn_match_cb(tls, our)) + { tls->alpn_chosen = our; return 0; } diff --git a/tls/ttls.c b/tls/ttls.c index 33b724c79..c2164f901 100644 --- a/tls/ttls.c +++ b/tls/ttls.c @@ -52,6 +52,7 @@ static ttls_send_cb_t *ttls_send_cb; extern ttls_sni_cb_t *ttls_sni_cb; extern ttls_hs_over_cb_t *ttls_hs_over_cb; extern ttls_cli_id_t *ttls_cli_id_cb; +extern ttls_alpn_match_t *ttls_alpn_match_cb; 
static inline size_t ttls_max_ciphertext_len(const TlsXfrm *xfrm) @@ -236,12 +237,14 @@ ttls_skb_extract_alert(TlsIOCtx *io, TlsXfrm *xfrm) */ void ttls_register_callbacks(ttls_send_cb_t *send_cb, ttls_sni_cb_t *sni_cb, - ttls_hs_over_cb_t *hs_over_cb, ttls_cli_id_t *cli_id_cb) + ttls_hs_over_cb_t *hs_over_cb, ttls_cli_id_t *cli_id_cb, + ttls_alpn_match_t *alpn_match_cb) { ttls_send_cb = send_cb; ttls_sni_cb = sni_cb; ttls_hs_over_cb = hs_over_cb; ttls_cli_id_cb = cli_id_cb; + ttls_alpn_match_cb = alpn_match_cb; } EXPORT_SYMBOL(ttls_register_callbacks); diff --git a/tls/ttls.h b/tls/ttls.h index da30bbc38..c70d90a9a 100644 --- a/tls/ttls.h +++ b/tls/ttls.h @@ -565,6 +565,7 @@ typedef struct ttls_context { typedef int ttls_send_cb_t(TlsCtx *tls, struct sg_table *sgt); typedef int ttls_sni_cb_t(TlsCtx *tls, const unsigned char *data, size_t len); typedef unsigned long ttls_cli_id_t(TlsCtx *tls, unsigned long hash); +typedef bool ttls_alpn_match_t(TlsCtx *tls, ttls_alpn_proto *alpn); enum { TTLS_HS_CB_FINISHED_NEW, From 0db6afb882984334689daf3d7a193655f196d0fd Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sun, 26 Jun 2022 23:46:55 +0300 Subject: [PATCH 03/26] Fix typos for the previous commit (TO BE SQUASHED) --- fw/http.c | 5 +++-- fw/sock_clnt.c | 2 +- fw/tls.c | 2 +- tls/tls_srv.c | 4 ++-- tls/ttls.h | 6 ++++-- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fw/http.c b/fw/http.c index 99ec338c6..90a2fea23 100644 --- a/fw/http.c +++ b/fw/http.c @@ -6547,8 +6547,9 @@ tfw_http_msg_process(TfwConn *conn, struct sk_buff *skb) { TfwStream *stream = &((TfwConn *)conn)->stream; - WARN_ON_ONCE(tfw_tls_context(c)->alpn_chosen - && tfw_tls_context(c)->alpn_chosen->id == TTLS_ALPN_ID_HTTP2 + WARN_ON_ONCE(tfw_tls_context(conn)->alpn_chosen + && tfw_tls_context(conn)->alpn_chosen->id + == TTLS_ALPN_ID_HTTP2 && TFW_FSM_TYPE(conn->proto.type) != TFW_FSM_H2); if (TFW_FSM_TYPE(conn->proto.type) == TFW_FSM_H2) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 
da7e58cef..e84d38971 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -390,7 +390,7 @@ tfw_listen_sock_add(const TfwAddr *addr, int type) shooks = &tfw_sock_http_clnt_ss_hooks; break; case TFW_FSM_HTTPS: - case TFW_FSM_h2: + case TFW_FSM_H2: /* * We call the same TLS hooks before generic HTTP processing * for both the HTTP/1 and HTTP/2. diff --git a/fw/tls.c b/fw/tls.c index 4d2d81f78..150f28839 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -908,7 +908,7 @@ ttls_cli_id(TlsCtx *tls, unsigned long hash) } bool -tfw_tls_alpn_match(TlsCtx *tls, ttls_alpn_proto *alpn) +tfw_tls_alpn_match(const TlsCtx *tls, const ttls_alpn_proto *alpn) { int sk_proto = ((SsProto *)tls->sk->sk_user_data)->type; diff --git a/tls/tls_srv.c b/tls/tls_srv.c index f30366900..9b1acbc98 100644 --- a/tls/tls_srv.c +++ b/tls/tls_srv.c @@ -32,7 +32,7 @@ ttls_sni_cb_t *ttls_sni_cb; ttls_hs_over_cb_t *ttls_hs_over_cb; -ttls_alpn_match_t *alpn_match_cb; +ttls_alpn_match_t *ttls_alpn_match_cb; static int ttls_parse_servername_ext(TlsCtx *tls, const unsigned char *buf, size_t len) @@ -381,7 +381,7 @@ ttls_parse_alpn_ext(TlsCtx *tls, const unsigned char *buf, size_t len) for (theirs = start; theirs != end; theirs += cur_len) { cur_len = *theirs++; if (ttls_alpn_ext_eq(our, theirs, cur_len) - && alpn_match_cb(tls, our)) + && ttls_alpn_match_cb(tls, our)) { tls->alpn_chosen = our; return 0; diff --git a/tls/ttls.h b/tls/ttls.h index c70d90a9a..ab4e702c6 100644 --- a/tls/ttls.h +++ b/tls/ttls.h @@ -565,7 +565,7 @@ typedef struct ttls_context { typedef int ttls_send_cb_t(TlsCtx *tls, struct sg_table *sgt); typedef int ttls_sni_cb_t(TlsCtx *tls, const unsigned char *data, size_t len); typedef unsigned long ttls_cli_id_t(TlsCtx *tls, unsigned long hash); -typedef bool ttls_alpn_match_t(TlsCtx *tls, ttls_alpn_proto *alpn); +typedef bool ttls_alpn_match_t(const TlsCtx *tls, const ttls_alpn_proto *alpn); enum { TTLS_HS_CB_FINISHED_NEW, @@ -581,7 +581,9 @@ void ttls_write_hshdr(unsigned char type, unsigned char *buf, 
unsigned short len); void *ttls_alloc_crypto_req(unsigned int extra_size, unsigned int *rsz); void ttls_register_callbacks(ttls_send_cb_t *send_cb, ttls_sni_cb_t *sni_cb, - ttls_hs_over_cb_t *hs_over_cb, ttls_cli_id_t *cli_id_cb); + ttls_hs_over_cb_t *hs_over_cb, + ttls_cli_id_t *cli_id_cb, + ttls_alpn_match_t *alpn_match_cb); const char *ttls_get_ciphersuite_name(const int ciphersuite_id); From 71ebe126ff360ebd3a45071c9bce544a755e2d1a Mon Sep 17 00:00:00 2001 From: Alexander K Date: Wed, 6 Jul 2022 19:24:12 +0300 Subject: [PATCH 04/26] Use separate memory caches for HTTPS and H2 connections. --- fw/http.c | 2 +- fw/sock_clnt.c | 47 +++++++++++++++++++++++++++++------------------ fw/tls.c | 1 + 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/fw/http.c b/fw/http.c index 90a2fea23..1247ea51c 100644 --- a/fw/http.c +++ b/fw/http.c @@ -6547,7 +6547,7 @@ tfw_http_msg_process(TfwConn *conn, struct sk_buff *skb) { TfwStream *stream = &((TfwConn *)conn)->stream; - WARN_ON_ONCE(tfw_tls_context(conn)->alpn_chosen + WARN_ON_ONCE(TFW_CONN_TLS(conn) && tfw_tls_context(conn)->alpn_chosen && tfw_tls_context(conn)->alpn_chosen->id == TTLS_ALPN_ID_HTTP2 && TFW_FSM_TYPE(conn->proto.type) != TFW_FSM_H2); diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index e84d38971..8cd788d5b 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -41,22 +41,24 @@ * ------------------------------------------------------------------------ */ -static struct kmem_cache *tfw_cli_conn_cache; +static struct kmem_cache *tfw_h1_conn_cache; +static struct kmem_cache *tfw_https_conn_cache; static struct kmem_cache *tfw_h2_conn_cache; static int tfw_cli_cfg_ka_timeout = -1; static inline struct kmem_cache * tfw_cli_cache(int type) { - /* - * Currently any secure (TLS) connection is considered as HTTP/2 - * connection, since we don't have any business with plain TLS. - * - * TODO #1422 this should be fixed since we still need HTTP/1 as - * more applicable protocol for service management. 
- */ - return type & TFW_FSM_HTTPS ? - tfw_h2_conn_cache : tfw_cli_conn_cache; + switch (TFW_FSM_TYPE(type)) { + case TFW_FSM_H2: + return tfw_h2_conn_cache; + case TFW_FSM_HTTPS: + return tfw_https_conn_cache; + case TFW_FSM_HTTP: + return tfw_h1_conn_cache; + default: + BUG(); + } } static void @@ -804,19 +806,27 @@ tfw_sock_clnt_init(void) */ BUILD_BUG_ON(Conn_Stop & (Conn_Clnt | Conn_Srv | TFW_FSM_HTTP | TFW_FSM_HTTPS)); - BUG_ON(tfw_cli_conn_cache); + BUG_ON(tfw_h1_conn_cache); + BUG_ON(tfw_https_conn_cache); BUG_ON(tfw_h2_conn_cache); - tfw_cli_conn_cache = kmem_cache_create("tfw_cli_conn_cache", - sizeof(TfwCliConn), 0, 0, NULL); - if (!tfw_cli_conn_cache) + tfw_h1_conn_cache = kmem_cache_create("tfw_h1_conn_cache", + sizeof(TfwCliConn), 0, 0, NULL); + if (!tfw_h1_conn_cache) return -ENOMEM; - tfw_h2_conn_cache = kmem_cache_create("tfw_h2_conn_cache", - sizeof(TfwH2Conn), 0, 0, NULL); + tfw_https_conn_cache = kmem_cache_create("tfw_https_conn_cache", + sizeof(TfwTlsConn), 0, 0, NULL); + if (!tfw_https_conn_cache) { + kmem_cache_destroy(tfw_h1_conn_cache); + return -ENOMEM; + } + tfw_h2_conn_cache = kmem_cache_create("tfw_h2_conn_cache", + sizeof(TfwH2Conn), 0, 0, NULL); if (!tfw_h2_conn_cache) { - kmem_cache_destroy(tfw_cli_conn_cache); + kmem_cache_destroy(tfw_https_conn_cache); + kmem_cache_destroy(tfw_h1_conn_cache); return -ENOMEM; } @@ -830,5 +840,6 @@ tfw_sock_clnt_exit(void) { tfw_mod_unregister(&tfw_sock_clnt_mod); kmem_cache_destroy(tfw_h2_conn_cache); - kmem_cache_destroy(tfw_cli_conn_cache); + kmem_cache_destroy(tfw_https_conn_cache); + kmem_cache_destroy(tfw_h1_conn_cache); } diff --git a/fw/tls.c b/fw/tls.c index 150f28839..7e4cf99e3 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -1096,6 +1096,7 @@ tfw_tls_init(void) goto err_h2; tfw_connection_hooks_register(&tls_conn_hooks, TFW_FSM_HTTPS); + tfw_connection_hooks_register(&tls_conn_hooks, TFW_FSM_H2); tfw_mod_register(&tfw_tls_mod); return 0; From c89698d2617be0d31cd6611864b4d9023dd6b0e4 Mon Sep 17 
00:00:00 2001 From: Alexander K Date: Thu, 7 Jul 2022 03:31:11 +0300 Subject: [PATCH 05/26] Don't try to clear H2 context on HTTPS connections. Disable annoying warning on inability to send TLS alerts on connections with handshakes in progress. Add comments for ss_active_guard_enter() about the need for the double checking. Add more information about connections counters state to the warning message about pending client connections. --- fw/sock.c | 23 ++++++++++++++++++++--- fw/tls.c | 10 +++++++--- tls/ttls.c | 3 ++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/fw/sock.c b/fw/sock.c index fa9311ff5..a7b996123 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -156,9 +156,20 @@ ss_active_guard_enter(unsigned long val) { atomic64_t *acnt = this_cpu_ptr(&__ss_act_cnt); + /* + * Don't race with ss_wait_newconn() and ss_synchronize() on the __ss_act_cnt + * if we committed to shutdown. + */ if (unlikely(!READ_ONCE(__ss_active))) return SS_BAD; + atomic64_add(val, acnt); + + /* + * If ss_stop() and the whole ss_wait_newconn() or ss_synchronize() were + * called between __ss_active check above and the addition, then revert + * the addition on the second check. 
+ */ if (unlikely(!READ_ONCE(__ss_active))) { atomic64_sub(val, acnt); return SS_BAD; @@ -1569,11 +1580,17 @@ ss_synchronize(void) for_each_online_cpu(cpu) { TfwRBQueue *wq = &per_cpu(si_wq, cpu); SsCloseBacklog *cb; + atomic64_t *acm; + cb = &per_cpu(close_backlog, cpu); + acm = &per_cpu(__ss_act_cnt, cpu); T_WARN(" cpu %d(%d), backlog size %lu," - " work queue size %d\n", - cpu, smp_processor_id(), cb->size, - tfw_wq_size(wq)); + " active connections mask %#lx," + " cntwork queue size %d\n", + cpu, smp_processor_id(), + cb->size, + (unsigned long)atomic64_read(acm), + tfw_wq_size(wq)); } T_WARN("Memory leakage is possible\n"); return; diff --git a/fw/tls.c b/fw/tls.c index 7e4cf99e3..0f6aa772a 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -620,7 +620,8 @@ tfw_tls_conn_dtor(void *c) struct sk_buff *skb; TlsCtx *tls = tfw_tls_context(c); - tfw_h2_context_clear(tfw_h2_context(c)); + if (TFW_FSM_TYPE(((TfwConn *)c)->proto.type) == TFW_FSM_H2) + tfw_h2_context_clear(tfw_h2_context(c)); if (tls) { while ((skb = ss_skb_dequeue(&tls->io_in.skb_list))) @@ -700,10 +701,13 @@ tfw_tls_conn_close(TfwConn *c, bool sync) * transmission. Otherwise if we have to close the socket * and can not write to the socket, then there is no other way than * skip the alert and just close the socket. + * + * That's just OK if we're closing a TCP connection during TLS handshake. */ if (r) { - T_WARN_ADDR("Close TCP socket w/o sending alert to the peer", - &c->peer->addr, TFW_NO_PORT); + if (r != -EPROTO) + T_WARN_ADDR("Close TCP socket w/o sending alert to" + " the peer", &c->peer->addr, TFW_NO_PORT); r = ss_close(c->sk, sync ? 
SS_F_SYNC : 0); } diff --git a/tls/ttls.c b/tls/ttls.c index c2164f901..06614f84b 100644 --- a/tls/ttls.c +++ b/tls/ttls.c @@ -2279,7 +2279,8 @@ ttls_close_notify(TlsCtx *tls) T_DBG("write close notify\n"); if (tls->state != TTLS_HANDSHAKE_OVER) - return -EINVAL; + return -EPROTO; + return ttls_send_alert(tls, TTLS_ALERT_LEVEL_WARNING, TTLS_ALERT_MSG_CLOSE_NOTIFY); } From 0e6efe334bd647a2813a645cc0f52c1355f8e56c Mon Sep 17 00:00:00 2001 From: Alexander K Date: Thu, 14 Jul 2022 03:46:30 +0300 Subject: [PATCH 06/26] Refactoring: 1. Remove stress.[ch], which have never been used, and we still don't know whether we need them for the QoS. 2. Remove classifier - there is only one module (http_limits) using it and there is no sense to play with RCU and indirect calls. 3. Replace SS_CALL_GUARD_EXIT() with an (inlined) function and remove unused SS_CALL_GUARD_ENTER(). Add assertion that a socket is locked, when it enters frang_conn_limit() from tcp_v4_syn_recv_sock() (the socket is locked by tcp_create_openreq_child() -> inet_csk_clone_lock()). --- fw/filter.c | 24 +----- fw/http_limits.c | 218 +++++++++++++---------------------------------- fw/http_limits.h | 56 ------------ fw/sock.c | 33 +++---- fw/stress.c | 83 ------------------ fw/stress.h | 63 -------------- 6 files changed, 73 insertions(+), 404 deletions(-) delete mode 100644 fw/stress.c delete mode 100644 fw/stress.h diff --git a/fw/filter.c b/fw/filter.c index 15825e47f..7294bbc8a 100644 --- a/fw/filter.c +++ b/fw/filter.c @@ -148,7 +148,6 @@ static unsigned int tfw_ipv4_nf_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int r; const struct iphdr *ih; struct in6_addr addr6; @@ -161,16 +160,7 @@ tfw_ipv4_nf_hook(void *priv, struct sk_buff *skb, if (tfw_filter_check_ip(&addr6) == TFW_BLOCK) return NF_DROP; - /* Check classifiers for Layer 3. 
*/ - r = tfw_classify_ipv4(skb); - switch (r) { - case TFW_PASS: - return NF_ACCEPT; - case TFW_POSTPONE: - return NF_STOLEN; - } - - return NF_DROP; + return NF_ACCEPT; } static u8 * @@ -233,7 +223,6 @@ static unsigned int tfw_ipv6_nf_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int r; struct ipv6hdr *ih; ih = __ipv6_hdr_check(skb); @@ -243,16 +232,7 @@ tfw_ipv6_nf_hook(void *priv, struct sk_buff *skb, if (tfw_filter_check_ip(&ih->saddr) == TFW_BLOCK) return NF_DROP; - /* Check classifiers for Layer 3. */ - r = tfw_classify_ipv6(skb); - switch (r) { - case TFW_PASS: - return NF_ACCEPT; - case TFW_POSTPONE: - return NF_STOLEN; - } - - return NF_DROP; + return NF_ACCEPT; } static struct nf_hook_ops tfw_nf_ops[] __read_mostly = { diff --git a/fw/http_limits.c b/fw/http_limits.c index a2b8a6237..f0525cc03 100644 --- a/fw/http_limits.c +++ b/fw/http_limits.c @@ -43,157 +43,6 @@ #include "hash.h" #include "http_match.h" -/* - * ------------------------------------------------------------------------ - * Generic classifier functionality. - * ------------------------------------------------------------------------ - */ - -static DECLARE_BITMAP(tfw_inports, 65536) __read_mostly; - -static TfwClassifier __rcu *classifier = NULL; - -/** - * Shrink client connections hash and/or reduce QoS for blocked clients to - * lower back-end servers or local system load. - */ -void -tfw_classify_shrink(void) -{ - /* TODO: delete a connection from the LRU */ -} - -int -tfw_classify_ipv4(struct sk_buff *skb) -{ - int r; - TfwClassifier *clfr; - - rcu_read_lock(); - - clfr = rcu_dereference(classifier); - r = (clfr && clfr->classify_ipv4) - ? clfr->classify_ipv4(skb) - : TFW_PASS; - - rcu_read_unlock(); - - return r; -} - -int -tfw_classify_ipv6(struct sk_buff *skb) -{ - int r; - TfwClassifier *clfr; - - rcu_read_lock(); - - clfr = rcu_dereference(classifier); - r = (clfr && clfr->classify_ipv6) - ? 
clfr->classify_ipv6(skb) - : TFW_PASS; - - rcu_read_unlock(); - - return r; -} - -void -tfw_classifier_add_inport(__be16 port) -{ - set_bit(port, tfw_inports); -} - -void -tfw_classifier_remove_inport(__be16 port) -{ - clear_bit(port, tfw_inports); -} - -void -tfw_classifier_cleanup_inport(void) -{ - bitmap_zero(tfw_inports, 65536); -} - -static int -tfw_classify_conn_estab(struct sock *sk) -{ - int i; - unsigned short sport = tfw_addr_get_sk_sport(sk); - TfwClassifier *clfr; - - if (test_bit(sport, tfw_inports)) - goto ours; - - return TFW_PASS; - -ours: - rcu_read_lock(); - - clfr = rcu_dereference(classifier); - i = (clfr && clfr->classify_conn_estab) - ? clfr->classify_conn_estab(sk) - : TFW_PASS; - - rcu_read_unlock(); - - return i; -} - -static void -tfw_classify_conn_close(struct sock *sk) -{ - TfwClassifier *clfr = rcu_dereference(classifier); - - if (clfr && clfr->classify_conn_close) - clfr->classify_conn_close(sk); -} - -/** - * Called from sk_filter() called from tcp_v4_rcv() and tcp_v6_rcv(), - * i.e. when IP fragments are already assembled and we can process TCP. - */ -static int -tfw_classify_tcp(struct sock *sk, struct sk_buff *skb) -{ - struct tcphdr *th = tcp_hdr(skb); - TfwClassifier *clfr = rcu_dereference(classifier); - - return clfr && clfr->classify_tcp ? clfr->classify_tcp(th) : TFW_PASS; -} - -/* - * tfw_classifier_register() and tfw_classifier_unregister() - * are called at Tempesta start/stop time. The execution is - * serialized with a mutex. There's no need for additional - * protection of rcu_assign_pointer() from concurrent use. 
- */ -void -tfw_classifier_register(TfwClassifier *mod) -{ - T_DBG("Registering new classifier: %s\n", mod->name); - - BUG_ON(classifier); - rcu_assign_pointer(classifier, mod); -} - -void -tfw_classifier_unregister(void) -{ - T_DBG("Un-registering classifier: %s\n", classifier->name); - - rcu_assign_pointer(classifier, NULL); - synchronize_rcu(); -} - -static TempestaOps tempesta_ops = { - .sk_alloc = tfw_classify_conn_estab, - .sk_free = tfw_classify_conn_close, - .sock_tcp_rcv = tfw_classify_tcp, -}; - /* * ------------------------------------------------------------------------ * Frang classifier - static http limits implementation. @@ -338,6 +187,9 @@ frang_conn_new(struct sock *sk) TfwAddr addr; TfwVhost *dflt_vh = tfw_vhost_lookup_default(); + /* The new socket is allocated by inet_csk_clone_lock(). */ + assert_spin_locked(&sk->sk_lock.slock); + /* * Default vhost configuration stores global frang settings, it's always * available even on reload under heavy load. But the pointer comes @@ -399,7 +251,7 @@ frang_conn_new(struct sock *sk) * Just update current connection count for a user. */ static void -frang_conn_close(struct sock *sk) +tfw_classify_conn_close(struct sock *sk) { FrangAcc *ra = sk->sk_security; @@ -1591,6 +1443,7 @@ frang_tls_handler(TlsCtx *tls, int state) TfwVhost *dflt_vh = tfw_vhost_lookup_default(); int r; + BUG_ON(!ra); // TODO #1643 remove me if (WARN_ON_ONCE(!dflt_vh)) return TFW_BLOCK; @@ -1606,11 +1459,52 @@ frang_tls_handler(TlsCtx *tls, int state) return r; } -static TfwClassifier frang_class_ops = { - .name = "frang", - .classify_conn_estab = frang_conn_new, - .classify_conn_close = frang_conn_close, -}; +/* + * ------------------------------------------------------------------------ + * Generic classifier functionality. 
+ * ------------------------------------------------------------------------ + */ + +static DECLARE_BITMAP(tfw_inports, 65536) __read_mostly; + +void +tfw_classifier_add_inport(__be16 port) +{ + pr_err("AK_DBG 1643: set port %x in bitmap\n", port); // TODO #1643 remove me + set_bit(port, tfw_inports); +} + +void +tfw_classifier_remove_inport(__be16 port) +{ + clear_bit(port, tfw_inports); +} + +void +tfw_classifier_cleanup_inport(void) +{ + bitmap_zero(tfw_inports, 65536); +} + +static int +tfw_classify_conn_estab(struct sock *sk) +{ + if (test_bit(tfw_addr_get_sk_sport(sk), tfw_inports)) + return frang_conn_new(sk); + else BUG_ON(tfw_addr_get_sk_sport(sk) == 0xbb01/*443*/); // TODO #1643 remove me + + return TFW_PASS; +} + +/** + * TODO #488: call from sk_filter() called from tcp_v4_rcv() and tcp_v6_rcv(), + * i.e. when IP fragments are already assembled and we can process TCP. + */ +static int +tfw_classify_tcp(struct sock *sk, struct sk_buff *skb) +{ + return TFW_PASS; +} /* * ------------------------------------------------------------------------ @@ -1704,6 +1598,12 @@ tfw_http_limits_hooks_register(void) return 0; } +static TempestaOps tempesta_ops = { + .sk_alloc = tfw_classify_conn_estab, + .sk_free = tfw_classify_conn_close, + .sock_tcp_rcv = tfw_classify_tcp, +}; + int __init tfw_http_limits_init(void) { @@ -1713,8 +1613,6 @@ tfw_http_limits_init(void) BUILD_BUG_ON((sizeof(FrangAcc) > sizeof(TfwClassifierPrvt))); - tfw_classifier_register(&frang_class_ops); - r = tfw_gfsm_register_fsm(TFW_FSM_FRANG_REQ, frang_http_req_handler); if (r) { T_ERR_NL("frang: can't register request fsm\n"); @@ -1738,7 +1636,6 @@ tfw_http_limits_init(void) err_fsm_resp: tfw_gfsm_unregister_fsm(TFW_FSM_FRANG_REQ); err_fsm: - tfw_classifier_unregister(); tempesta_unregister_ops(&tempesta_ops); return r; } @@ -1751,6 +1648,5 @@ tfw_http_limits_exit(void) tfw_http_limits_hooks_remove(); tfw_gfsm_unregister_fsm(TFW_FSM_FRANG_RESP); tfw_gfsm_unregister_fsm(TFW_FSM_FRANG_REQ); - 
tfw_classifier_unregister(); tempesta_unregister_ops(&tempesta_ops); } diff --git a/fw/http_limits.h b/fw/http_limits.h index d66d3f0b8..809558778 100644 --- a/fw/http_limits.h +++ b/fw/http_limits.h @@ -43,66 +43,10 @@ typedef struct { char _[TFW_CLASSIFIER_ACCSZ]; } TfwClassifierPrvt; -/* - * Classification module handler. - * - * TODO: - * -- modules should have possibility to register number of classifier callbacks, - * so store the callback in fixed size array, so we can quickly determine which - * callbacks (if either) we need to call. - */ -typedef struct { - char *name; - /* - * Classify a client on network L3 layer. - */ - int (*classify_ipv4)(struct sk_buff *skb); - int (*classify_ipv6)(struct sk_buff *skb); - /* - * Classify TCP segments. - */ - int (*classify_tcp)(struct tcphdr *th); - /* - * Called when a new client connection is established (many TCP SYNs - * can precede an established connection, so it's more efficient to - * handle events for established and closed. - */ - int (*classify_conn_estab)(struct sock *sk); - /* - * Called when a client connection closed. - */ - void (*classify_conn_close)(struct sock *sk); - /* - * TODO called on retransmits to client (e.g. SYN+ACK or data). - */ - int (*classify_tcp_timer_retrans)(void); - /* - * TODO called on sending TCP keep alive segments. - */ - int (*classify_tcp_timer_keepalive)(void); - /* - * TODO called when we choose our window size to report to client. - */ - int (*classify_tcp_window)(void); - /* - * TODO called when peer reported zero window, so we can't send data - * and must send TCP zero window probing segments. 
- */ - int (*classify_tcp_zwp)(void); -} TfwClassifier; - void tfw_classifier_add_inport(__be16 port); void tfw_classifier_remove_inport(__be16 port); void tfw_classifier_cleanup_inport(void); -void tfw_classify_shrink(void); - -int tfw_classify_ipv4(struct sk_buff *skb); -int tfw_classify_ipv6(struct sk_buff *skb); - -extern void tfw_classifier_register(TfwClassifier *mod); -extern void tfw_classifier_unregister(void); - /* * ------------------------------------------------------------------------ * Frang (static http limits classifier) configuration interface. diff --git a/fw/sock.c b/fw/sock.c index a7b996123..ec91a43a2 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -2,7 +2,7 @@ * Synchronous Socket API. * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2022 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -109,6 +109,10 @@ static const char *ss_statename[] = { * calls: the shutdown process must wait until all the calls finished and * no new calls can be executed. * + * __ss_act_cnt is per-CPU, but any given connection can increase it on one CPU + * and decrease on another - that's fine we do use only the sum of all per-CPU + * values. + * * However, softirqs can call SS down- or upcall any time. Moreover, there could * be an ingress packet for some Tempesta's socket and it initiates new * Tempesta's calls in softirq. So to guarantee shutdown process convergence we @@ -184,21 +188,12 @@ ss_active_guard_exit(unsigned long val) atomic64_sub(val, this_cpu_ptr(&__ss_act_cnt)); } -/** - * Guard for calling connection error/drop callback for each established socket, - * so we guarantee that all upper layer connections are closed. 
- */ -#define SS_CALL_GUARD_ENTER(cb, sk) \ -({ \ - ss_active_guard_enter(SS_V_ACT_LIVECONN); \ - SS_CALL(cb, sk); \ -}) - -#define SS_CALL_GUARD_EXIT(cb, sk) \ -do { \ - SS_CALL(cb, sk); \ - ss_active_guard_exit(SS_V_ACT_LIVECONN); \ -} while (0) +static void +__ss_conn_drop_guard_exit(struct sock *sk) +{ + SS_CALL(connection_drop, sk); + ss_active_guard_exit(SS_V_ACT_LIVECONN); +} static void ss_ipi(struct irq_work *work) @@ -684,7 +679,7 @@ static void ss_linkerror(struct sock *sk) { ss_do_close(sk); - SS_CALL_GUARD_EXIT(connection_drop, sk); + __ss_conn_drop_guard_exit(sk); sock_put(sk); /* paired with ss_do_close() */ } @@ -985,7 +980,7 @@ ss_tcp_state_change(struct sock *sk) * and ss_active_guard_enter() there. */ if (!lsk) - SS_CALL_GUARD_EXIT(connection_drop, sk); + __ss_conn_drop_guard_exit(sk); return; } @@ -1388,7 +1383,7 @@ EXPORT_SYMBOL(ss_getpeername); do { \ ss_do_close(sk); \ bh_unlock_sock(sk); \ - SS_CALL_GUARD_EXIT(connection_drop, sk); \ + __ss_conn_drop_guard_exit(sk); \ sock_put(sk); /* paired with ss_do_close() */ \ } while (0) diff --git a/fw/stress.c b/fw/stress.c deleted file mode 100644 index b67d49648..000000000 --- a/fw/stress.c +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Tempesta FW - * - * Interface to stress (local system or back-end server overloading) - * handling modules. - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include "tempesta_fw.h" -#include "http_limits.h" -#include "stress.h" - -/* TODO replace by RCU list. */ -static LIST_HEAD(stress_handlers); -static DEFINE_RWLOCK(tfw_stress_lock); - -void -tfw_stress_account_srv(/* we need here a packet and connection */) -{ - TfwStress *s; - - read_lock(&tfw_stress_lock); - list_for_each_entry(s, &stress_handlers, st_list) { - if (s->type & TfwStress_Srv) - if (s->account_srv()) - tfw_classify_shrink(); - } - read_unlock(&tfw_stress_lock); -} - -void -tfw_stress_account_sys(void) -{ - TfwStress *s; - - read_lock(&tfw_stress_lock); - list_for_each_entry(s, &stress_handlers, st_list) { - if (s->type & TfwStress_Sys) - if (s->account_sys()) - tfw_classify_shrink(); - } - read_unlock(&tfw_stress_lock); -} - -int -tfw_stress_register(TfwStress *mod) -{ - write_lock(&tfw_stress_lock); - list_add(&mod->st_list, &stress_handlers); - write_unlock(&tfw_stress_lock); - - return 0; -} - -void -tfw_stress_unregister(TfwStress *mod) -{ - TfwStress *s, *tmp; - - write_lock(&tfw_stress_lock); - list_for_each_entry_safe(s, tmp, &stress_handlers, st_list) { - if (s == mod) { - list_del(&s->st_list); - break; - } - } - write_unlock(&tfw_stress_lock); -} diff --git a/fw/stress.h b/fw/stress.h deleted file mode 100644 index 51dc64e5e..000000000 --- a/fw/stress.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Tempesta FW - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2018 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#ifndef __TFW_STRESS__ -#define __TFW_STRESS__ - -#include - -#include "tempesta_fw.h" - -typedef enum { - TfwStress_Sys = 1, - TfwStress_Srv = 2, -} TfwStressType; - -/* Stress module handler. */ -typedef struct { - struct list_head st_list; /* list of stress handlers */ - - TfwStressType type; - - /* TODO it seems we can catch the stress events (the both callbacks - * below) just on receiving a response (for account_srv) and receiving - * a request (for account_sys). - */ - - /* - * Account and handle back-end server overload. - * @return true if there is overload and false otherwise. - */ - bool (*account_srv)(void); - /* - * Account and handle local system overload. - * @return true if there is overload and false otherwise. - */ - bool (*account_sys)(void); - -} TfwStress; - -void tfw_stress_account_srv(void); -void tfw_stress_account_sys(void); - -int tfw_stress_register(TfwStress *mod); -void tfw_stress_unregister(TfwStress *mod); - -#endif /* __TFW_STRESS__ */ From 8c4259f77749d79dd5cc4457a176e36b146a9738 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Thu, 14 Jul 2022 04:02:41 +0300 Subject: [PATCH 07/26] Fix crash on an empty account descriptor for HTTP limiting: add a new listening port to the bitmap before moving the socket to listening state - this way we guarantee that HTTP limiting is called on the initialized accounting descriptor. The problem appears on constant system restart under 2000 HTTPS connections from wrk. 
--- fw/http_limits.c | 3 --- fw/sock_clnt.c | 8 +++++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fw/http_limits.c b/fw/http_limits.c index f0525cc03..70afac92d 100644 --- a/fw/http_limits.c +++ b/fw/http_limits.c @@ -1443,7 +1443,6 @@ frang_tls_handler(TlsCtx *tls, int state) TfwVhost *dflt_vh = tfw_vhost_lookup_default(); int r; - BUG_ON(!ra); // TODO #1643 remove me if (WARN_ON_ONCE(!dflt_vh)) return TFW_BLOCK; @@ -1470,7 +1469,6 @@ static DECLARE_BITMAP(tfw_inports, 65536) __read_mostly; void tfw_classifier_add_inport(__be16 port) { - pr_err("AK_DBG 1643: set port %x in bitmap\n", port); // TODO #1643 remove me set_bit(port, tfw_inports); } @@ -1491,7 +1489,6 @@ tfw_classify_conn_estab(struct sock *sk) { if (test_bit(tfw_addr_get_sk_sport(sk), tfw_inports)) return frang_conn_new(sk); - else BUG_ON(tfw_addr_get_sk_sport(sk) == 0xbb01/*443*/); // TODO #1643 remove me return TFW_PASS; } diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 8cd788d5b..8f110fa71 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -682,13 +682,19 @@ tfw_sock_clnt_start(void) list_del(&ls->list); list_add(&ls->list, &tfw_listen_socks); + /* + * Paired with tfw_classify_conn_estab(): firstly add the port + * to the bitmap and then move it to the listen state to + * guarantee that the HTTP limits initialization code was called. + */ + tfw_classifier_add_inport(tfw_addr_port(&ls->addr)); + if ((r = tfw_listen_sock_start(ls))) { T_ERR_ADDR("can't start listening on", &ls->addr, TFW_WITH_PORT); goto done; } - tfw_classifier_add_inport(tfw_addr_port(&ls->addr)); listen_socks_sz++; } From b95c9f254749906f79df4f24d0a342390780cf0b Mon Sep 17 00:00:00 2001 From: Alexander K Date: Tue, 9 Aug 2022 21:12:33 +0300 Subject: [PATCH 08/26] Refactoring: there is no need to pass global tfw_mods as a parameter to static functions. 
Instead, just store and restore the list in the particular test (we need this since there is a session test, which initializes the session module in the list). Remove unnecessary exports. --- fw/main.c | 56 ++++++++++++++++++++++---------------------- fw/t/unit/test_cfg.c | 22 ++++++++++------- 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/fw/main.c b/fw/main.c index 6d77d4513..5c9d6178c 100644 --- a/fw/main.c +++ b/fw/main.c @@ -90,7 +90,6 @@ tfw_mod_register(TfwMod *mod) list_add_tail(&mod->list, &tfw_mods); write_unlock(&tfw_mods_lock); } -EXPORT_SYMBOL(tfw_mod_register); /** * Remove the @mod from the global list. @@ -105,7 +104,6 @@ tfw_mod_unregister(TfwMod *mod) list_del(&mod->list); write_unlock(&tfw_mods_lock); } -EXPORT_SYMBOL(tfw_mod_unregister); TfwMod * tfw_mod_find(const char *name) @@ -124,8 +122,8 @@ tfw_mod_find(const char *name) return NULL; } -static inline void -tfw_cleanup(struct list_head *mod_list) +static void +tfw_cleanup(void) { /* * Wait until all network activity is stopped @@ -134,24 +132,26 @@ tfw_cleanup(struct list_head *mod_list) if (!tfw_runstate_is_reconfig()) ss_synchronize(); - tfw_cfg_cleanup(mod_list); + tfw_cfg_cleanup(&tfw_mods); if (!tfw_runstate_is_reconfig()) tfw_sg_wait_release(); T_DBG("New configuration is cleaned.\n"); } -static inline void -tfw_mods_stop(struct list_head *mod_list) +static void +tfw_mods_stop(void) { TfwMod *mod; ss_stop(); T_DBG("Stopping all modules...\n"); - MOD_FOR_EACH_REVERSE(mod, mod_list) { + pr_err("AK_DBG Stopping all modules...\n"); + MOD_FOR_EACH_REVERSE(mod, &tfw_mods) { T_DBG2("mod_stop(): %s\n", mod->name); if (mod->stop && mod->started) { + pr_err("AK_DBG mod_stop(): %s\n", mod->name); mod->stop(); mod->started = 0; } @@ -161,20 +161,20 @@ tfw_mods_stop(struct list_head *mod_list) } static void -tfw_stop(struct list_head *mod_list) +tfw_stop(void) { - tfw_mods_stop(mod_list); - tfw_cleanup(mod_list); + tfw_mods_stop(); + tfw_cleanup(); } static int 
-tfw_mods_cfgstart(struct list_head *mod_list) +tfw_mods_cfgstart(void) { int ret; TfwMod *mod; T_DBG2("Prepare the configuration processing...\n"); - MOD_FOR_EACH(mod, mod_list) { + MOD_FOR_EACH(mod, &tfw_mods) { if (!mod->cfgstart) continue; T_DBG2("mod_cfgstart(): %s\n", mod->name); @@ -190,13 +190,13 @@ tfw_mods_cfgstart(struct list_head *mod_list) } static int -tfw_mods_start(struct list_head *mod_list) +tfw_mods_start(void) { int ret; TfwMod *mod; T_DBG2("starting modules...\n"); - MOD_FOR_EACH(mod, mod_list) { + MOD_FOR_EACH(mod, &tfw_mods) { if (!mod->start) continue; T_DBG2("mod_start(): %s\n", mod->name); @@ -213,13 +213,13 @@ tfw_mods_start(struct list_head *mod_list) } static int -tfw_mods_cfgend(struct list_head *mod_list) +tfw_mods_cfgend(void) { int ret; TfwMod *mod; T_DBG2("Completing the configuration processing...\n"); - MOD_FOR_EACH(mod, mod_list) { + MOD_FOR_EACH(mod, &tfw_mods) { if (!mod->cfgend) continue; T_DBG2("mod_cfgend(): %s\n", mod->name); @@ -235,20 +235,20 @@ tfw_mods_cfgend(struct list_head *mod_list) } static int -tfw_start(struct list_head *mod_list) +tfw_start(void) { int ret; ss_start(); - if ((ret = tfw_mods_cfgstart(mod_list))) + if ((ret = tfw_mods_cfgstart())) goto cleanup; - if ((ret = tfw_cfg_parse(mod_list))) + if ((ret = tfw_cfg_parse(&tfw_mods))) goto cleanup; - if ((ret = tfw_mods_cfgend(mod_list))) + if ((ret = tfw_mods_cfgend())) goto cleanup; - if ((ret = tfw_mods_start(mod_list))) + if ((ret = tfw_mods_start())) goto stop_mods; - tfw_cfg_conclude(mod_list); + tfw_cfg_conclude(&tfw_mods); WRITE_ONCE(tfw_started, true); T_LOG_NL("Tempesta FW is ready\n"); @@ -261,11 +261,11 @@ tfw_start(struct list_head *mod_list) * and Tempesta must be fully stopped and cleared. */ WRITE_ONCE(tfw_reconfig, false); - tfw_mods_stop(mod_list); + tfw_mods_stop(); WRITE_ONCE(tfw_started, false); cleanup: T_WARN_NL("Configuration parsing has failed. 
Clean up...\n"); - tfw_cleanup(mod_list); + tfw_cleanup(); return ret; } @@ -286,7 +286,7 @@ tfw_ctlfn_state_change(const char *old_state, const char *new_state) T_LOG("Live reconfiguration of Tempesta.\n"); } - r = tfw_start(&tfw_mods); + r = tfw_start(); WRITE_ONCE(tfw_reconfig, false); return r; @@ -298,7 +298,7 @@ tfw_ctlfn_state_change(const char *old_state, const char *new_state) return -EINVAL; } - tfw_stop(&tfw_mods); + tfw_stop(); WRITE_ONCE(tfw_started, false); return 0; @@ -420,7 +420,7 @@ tfw_exit(void) mutex_lock(&tfw_sysctl_mtx); if (READ_ONCE(tfw_started)) { T_WARN_NL("Tempesta FW is still running, shutting down...\n"); - tfw_stop(&tfw_mods); + tfw_stop(); WRITE_ONCE(tfw_started, false); } mutex_unlock(&tfw_sysctl_mtx); diff --git a/fw/t/unit/test_cfg.c b/fw/t/unit/test_cfg.c index bdc5de475..809db28cc 100644 --- a/fw/t/unit/test_cfg.c +++ b/fw/t/unit/test_cfg.c @@ -43,8 +43,6 @@ * our specs may interfere with already existing modules. * Instead, we create a dummy TfwMod{} and pass it to them as if it was real. 
*/ - -static LIST_HEAD(test_tfw_mods); TfwMod test_dummy_mod = { .name = "test_dummy_mod" }; static int @@ -52,22 +50,22 @@ do_parse_cfg(const char *cfg_text, TfwCfgSpec specs[]) { int ret; - BUG_ON(!list_empty(&test_tfw_mods)); + BUG_ON(!list_empty(&tfw_mods)); test_dummy_mod.specs = specs; - list_add(&test_dummy_mod.list, &test_tfw_mods); + list_add(&test_dummy_mod.list, &tfw_mods); - if ((ret = tfw_cfg_parse_mods(cfg_text, &test_tfw_mods))) + if ((ret = tfw_cfg_parse_mods(cfg_text, &tfw_mods))) return ret; - if ((ret = tfw_mods_cfgend(&test_tfw_mods))) + if ((ret = tfw_mods_cfgend())) return ret; - return tfw_mods_start(&test_tfw_mods); + return tfw_mods_start(); } static void do_cleanup_cfg(void) { - BUG_ON(list_empty(&test_tfw_mods)); - tfw_stop(&test_tfw_mods); + BUG_ON(list_empty(&tfw_mods)); + tfw_stop(); list_del(&test_dummy_mod.list); } @@ -1089,6 +1087,10 @@ TEST(tfw_cfg_handle_children, propagates_cleanup_to_nested_specs) TEST_SUITE(cfg) { + LIST_HEAD(tmp_tfw_mods); + tmp_tfw_mods = tfw_mods; + INIT_LIST_HEAD(&tfw_mods); + TEST_RUN(cfg_parser, invokes_specified_handler); TEST_RUN(cfg_parser, allows_repeating_entries); TEST_RUN(cfg_parser, allows_optional_entries); @@ -1112,4 +1114,6 @@ TEST_SUITE(cfg) TEST_RUN(tfw_cfg_set_str, checks_character_set); TEST_RUN(tfw_cfg_handle_children, parses_nested_entries_recursively); TEST_RUN(tfw_cfg_handle_children, propagates_cleanup_to_nested_specs); + + tfw_mods = tmp_tfw_mods; } From 3bb0f5f1365958cec2e7d5c11a592b5e28243674 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Wed, 10 Aug 2022 22:00:07 +0300 Subject: [PATCH 09/26] tfw_mods_stop() calls tfw_sock_clnt_stop() releasing all client sockets and immediately after that tfw_client_stop() closes the clients database, so during ss_synchronize() client connections closing callback tfw_sock_clnt_drop() removes client connection from the dead linked list. 
This patch count each sock.c users (they must set TfwMod->sock_user) and calls ss_synchronize() exactly when last of them finishes (including interrupted start procedure due to an error in some module). --- fw/main.c | 40 +++++++++++++++++++++++++++------------- fw/sock_clnt.c | 11 ++++++----- fw/sock_srv.c | 1 + fw/tempesta_fw.h | 5 ++++- 4 files changed, 38 insertions(+), 19 deletions(-) diff --git a/fw/main.c b/fw/main.c index 5c9d6178c..df10ddf6f 100644 --- a/fw/main.c +++ b/fw/main.c @@ -47,6 +47,7 @@ size_t exit_hooks_n; DEFINE_MUTEX(tfw_sysctl_mtx); static bool tfw_started = false; static bool tfw_reconfig = false; +static int tfw_ss_users = 0; /* * The global list of all registered modules @@ -125,13 +126,6 @@ tfw_mod_find(const char *name) static void tfw_cleanup(void) { - /* - * Wait until all network activity is stopped - * before data in modules can be cleaned up safely. - */ - if (!tfw_runstate_is_reconfig()) - ss_synchronize(); - tfw_cfg_cleanup(&tfw_mods); if (!tfw_runstate_is_reconfig()) @@ -143,19 +137,36 @@ static void tfw_mods_stop(void) { TfwMod *mod; + bool ss_synced = false; ss_stop(); T_DBG("Stopping all modules...\n"); - pr_err("AK_DBG Stopping all modules...\n"); MOD_FOR_EACH_REVERSE(mod, &tfw_mods) { T_DBG2("mod_stop(): %s\n", mod->name); - if (mod->stop && mod->started) { - pr_err("AK_DBG mod_stop(): %s\n", mod->name); - mod->stop(); - mod->started = 0; - } + if (!mod->stop || !mod->started) + continue; + + mod->stop(); + mod->started = 0; + + tfw_ss_users -= mod->sock_user; + if (ss_synced || tfw_ss_users || tfw_runstate_is_reconfig()) + continue; + /* + * Wait until all network activity is stopped before data in + * modules can be cleaned up safely. We must do this between + * stopping modules using synchronous sockets and modules + * providing data structures for the first modules. 
+ * In particular, we need to stop all networking activity after + * stopping sock_clnt and during the synchronization period the + * client database must provide valid references to stored + * clients. + */ + ss_synchronize(); + ss_synced = false; } + BUG_ON(tfw_ss_users); T_LOG("modules are stopped\n"); } @@ -197,6 +208,8 @@ tfw_mods_start(void) T_DBG2("starting modules...\n"); MOD_FOR_EACH(mod, &tfw_mods) { + BUG_ON(mod->sock_user && (!mod->start || !mod->stop)); + if (!mod->start) continue; T_DBG2("mod_start(): %s\n", mod->name); @@ -206,6 +219,7 @@ tfw_mods_start(void) return ret; } mod->started = 1; + tfw_ss_users += mod->sock_user; } T_DBG("modules are started\n"); diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 8f110fa71..16d5b42b2 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -790,11 +790,12 @@ static TfwCfgSpec tfw_sock_clnt_specs[] = { }; TfwMod tfw_sock_clnt_mod = { - .name = "sock_clnt", - .cfgend = tfw_sock_clnt_cfgend, - .start = tfw_sock_clnt_start, - .stop = tfw_sock_clnt_stop, - .specs = tfw_sock_clnt_specs, + .name = "sock_clnt", + .cfgend = tfw_sock_clnt_cfgend, + .start = tfw_sock_clnt_start, + .stop = tfw_sock_clnt_stop, + .specs = tfw_sock_clnt_specs, + .sock_user = 1, }; /* diff --git a/fw/sock_srv.c b/fw/sock_srv.c index 47e54bd4b..31fed9f56 100644 --- a/fw/sock_srv.c +++ b/fw/sock_srv.c @@ -2413,6 +2413,7 @@ static TfwMod tfw_sock_srv_mod = { .start = tfw_sock_srv_start, .stop = tfw_sock_srv_stop, .specs = tfw_sock_srv_specs, + .sock_user = 1, }; /* diff --git a/fw/tempesta_fw.h b/fw/tempesta_fw.h index 846b1a2d1..026c87c2d 100644 --- a/fw/tempesta_fw.h +++ b/fw/tempesta_fw.h @@ -79,6 +79,8 @@ * @stop - called to stop a module when Tempesta is stopped; * @specs - array of configuration directives specifications * for a module, terminated by a null element; + * @started - the module current status; + * @sock_user - the module uses sockets service; */ typedef struct { struct list_head list; @@ -89,7 +91,8 @@ typedef struct { int 
(*start)(void); void (*stop)(void); TfwCfgSpec *specs; - unsigned int started:1; + unsigned int started:1, + sock_user:1; } TfwMod; #define MOD_FOR_EACH(pos, head) \ From 204861f44d398800d9370683f161cfadde12aaa3 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Wed, 10 Aug 2022 23:40:54 +0300 Subject: [PATCH 10/26] We don't use autovectorization any more, so the connection code doesn't need to save and restore FPU context before calling synchronous sockets. --- fw/connection.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fw/connection.c b/fw/connection.c index b3b96e487..3cd5eee94 100644 --- a/fw/connection.c +++ b/fw/connection.c @@ -69,20 +69,7 @@ tfw_connection_repair(TfwConn *conn) int tfw_connection_close(TfwConn *conn, bool sync) { - int r; - - /* - * This function might be called from process context on Tempesta FW - * start or stop operation or from softirq, so need to save FPU context - * to call autovectorized synchronous sockets code. - */ - kernel_fpu_begin_mask(KFPU_MXCSR); - - r = TFW_CONN_HOOK_CALL(conn, conn_close, sync); - - kernel_fpu_end(); - - return r; + return TFW_CONN_HOOK_CALL(conn, conn_close, sync); } /** From 27d5aed91deca89489919bc408eaff65edcceb74 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sat, 13 Aug 2022 18:37:48 +0300 Subject: [PATCH 11/26] Remove 'Bad TLS alert' message since it's raised on a peer connection close alert, which is just normal. 
--- tls/ttls.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tls/ttls.c b/tls/ttls.c index 06614f84b..d468397c0 100644 --- a/tls/ttls.c +++ b/tls/ttls.c @@ -2261,7 +2261,6 @@ ttls_recv(void *tls_data, unsigned char *buf, unsigned int len, unsigned int *re if (io->msgtype == TTLS_MSG_ALERT) { if (!(r = ttls_handle_alert(tls))) return T_OK; - TTLS_WARN(tls, "Bad TLS alert\n"); return T_DROP; } From 7d8b8aa7e8230acfce259aab625583390d553af1 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sat, 13 Aug 2022 19:04:40 +0300 Subject: [PATCH 12/26] Ignore ctags, cscope and vim files --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 09485a657..3e4cde744 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,12 @@ Module.symvers *~ \#*\# .\#* +*.orig +.*.swp + +# ctags and cscope +cscope.* +tags # eclipse project settings .pydevproject From 136402f1e28ef609f64755b6cfed0e722844d20a Mon Sep 17 00:00:00 2001 From: Alexander K Date: Tue, 16 Aug 2022 20:33:54 +0300 Subject: [PATCH 13/26] Typically we call tfw_sock_clnt_drop() for TLS connections, when tcp_write_xmit() calls tfw_tls_encrypt(). However, this may never happen if the peer didn't announce sufficient receive window. In this case Tempesta shutdown may find live connections and raise "pending active connections for 5s" and leave the connections objects in the memory cache. ss_synchronize() now returns a success value and we abort all the client connections in case of failure and recheck all the counters with ss_synchronize() again. To abort all the connections we extend ss_do_send() with __SS_F_RST and send RST on aborted connections. The connections aborting (TCP RST) is also useful on security events: when we see any malicious activity we should reset connection instead of issuing normal TCP connections closing. 
For this purpose conn_abort callback was introduced for TfwConnHooks and ss_tcp_data_ready() normally closes or aborts a TCP connection depending on ss_tcp_process_data() return value (SS_BAD and SS_DROP respectively), so several innocent places returning T_DROP/SS_DROP were adjusted to return T_BAD/SS_BAD, most likely not all the places though. tfw_peer_del_conn() was updated to use 1-depth nesting to be called as a callback from tfw_peer_for_each_conn() (the ability isn't used by the current version of connections aborting). Also: - several comments are updated/fixed; - a couple of macros are replaced with inline functions - better messaging in ss_synchronize() - some other minor code cleanups --- fw/client.h | 4 +- fw/connection.c | 6 ++ fw/connection.h | 28 +++++++ fw/main.c | 17 +++-- fw/peer.h | 18 +---- fw/sock.c | 173 +++++++++++++++++++++++--------------------- fw/sock_clnt.c | 36 +++++---- fw/sock_srv.c | 5 +- fw/ss_skb.h | 4 +- fw/sync_socket.h | 5 +- fw/t/unit/helpers.c | 10 ++- fw/tls.c | 41 ++++++----- fw/websocket.c | 10 +++ lib/log.h | 4 +- linux-5.10.35.patch | 18 +++-- tls/ttls.c | 15 ++-- 16 files changed, 233 insertions(+), 161 deletions(-) diff --git a/fw/client.h b/fw/client.h index bcab496f2..f33042ccb 100644 --- a/fw/client.h +++ b/fw/client.h @@ -2,7 +2,7 @@ * Tempesta FW * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2018 Tempesta Technologies, Inc. + * Copyright (C) 2015-2022 Tempesta Technologies, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -43,7 +43,7 @@ int tfw_client_for_each(int (*fn)(void *)); void tfw_client_set_expires_time(unsigned int expires_time); void tfw_cli_conn_release(TfwCliConn *cli_conn); int tfw_cli_conn_send(TfwCliConn *cli_conn, TfwMsg *msg); -int tfw_cli_conn_close_all_sync(TfwClient *cli); +void tfw_cli_abort_all(void); void tfw_tls_connection_lost(TfwConn *conn); diff --git a/fw/connection.c b/fw/connection.c index 3cd5eee94..0e457d164 100644 --- a/fw/connection.c +++ b/fw/connection.c @@ -72,6 +72,12 @@ tfw_connection_close(TfwConn *conn, bool sync) return TFW_CONN_HOOK_CALL(conn, conn_close, sync); } +void +tfw_connection_abort(TfwConn *conn) +{ + TFW_CONN_HOOK_CALL(conn, conn_abort); +} + /** * Publish the "connection is dropped" event via TfwConnHooks. */ diff --git a/fw/connection.h b/fw/connection.h index 23ffc2c4c..e23b61f60 100644 --- a/fw/connection.h +++ b/fw/connection.h @@ -262,6 +262,13 @@ typedef struct { */ int (*conn_close)(TfwConn *conn, bool sync); + /* + * Called to abort a connection intentionally on Tempesta side. + * This is rough connection closing without any notifications like TLS + * alerts, probably with TCP RST or just silent connection termination. + */ + void (*conn_abort)(TfwConn *conn); + /* * Called when closing a connection (client or server, * as in conn_init()). This is required for modules that @@ -517,6 +524,26 @@ tfw_connection_validate_cleanup(TfwConn *conn) BUG_ON(rc && rc != TFW_CONN_DEATHCNT); } +static inline int +tfw_peer_for_each_conn(TfwPeer *p, int (*cb)(TfwConn *)) +{ + int r = 0; + TfwConn *conn, *tmp_conn; + + spin_lock_bh(&p->conn_lock); + + /* @cb() may delete connections from the list. 
*/ + list_for_each_entry_safe(conn, tmp_conn, &p->conn_list, list) { + r = cb(conn); + if (unlikely(r)) + break; + } + + spin_unlock_bh(&(p)->conn_lock); + + return r; +} + void tfw_connection_hooks_register(TfwConnHooks *hooks, int type); void tfw_connection_hooks_unregister(int type); int tfw_connection_send(TfwConn *conn, TfwMsg *msg); @@ -529,6 +556,7 @@ void tfw_connection_link_peer(TfwConn *conn, TfwPeer *peer); int tfw_connection_new(TfwConn *conn); void tfw_connection_repair(TfwConn *conn); int tfw_connection_close(TfwConn *conn, bool sync); +void tfw_connection_abort(TfwConn *conn); void tfw_connection_drop(TfwConn *conn); void tfw_connection_release(TfwConn *conn); diff --git a/fw/main.c b/fw/main.c index df10ddf6f..2d2386b80 100644 --- a/fw/main.c +++ b/fw/main.c @@ -28,10 +28,9 @@ #include "cfg.h" #include "client.h" #include "log.h" +#include "server.h" #include "str.h" #include "sync_socket.h" -#include "server.h" -#include "vhost.h" MODULE_AUTHOR(TFW_AUTHOR); MODULE_DESCRIPTION(TFW_NAME); @@ -151,7 +150,7 @@ tfw_mods_stop(void) mod->started = 0; tfw_ss_users -= mod->sock_user; - if (ss_synced || tfw_ss_users || tfw_runstate_is_reconfig()) + if (ss_synced || tfw_ss_users) continue; /* * Wait until all network activity is stopped before data in @@ -163,8 +162,12 @@ tfw_mods_stop(void) * client database must provide valid references to stored * clients. */ - ss_synchronize(); - ss_synced = false; + if (!ss_synchronize()) { + tfw_cli_abort_all(); + /* Check that all the connections are terminated now. 
*/ + WARN_ON(!ss_synchronize()); + } + ss_synced = true; } BUG_ON(tfw_ss_users); @@ -219,7 +222,9 @@ tfw_mods_start(void) return ret; } mod->started = 1; - tfw_ss_users += mod->sock_user; + + if (!tfw_runstate_is_reconfig()) + tfw_ss_users += mod->sock_user; } T_DBG("modules are started\n"); diff --git a/fw/peer.h b/fw/peer.h index 5c2579b4d..7b94ac1e4 100644 --- a/fw/peer.h +++ b/fw/peer.h @@ -1,7 +1,7 @@ /** * Tempesta FW * - * Copyright (C) 2015-2018 Tempesta Technologies, Inc. + * Copyright (C) 2015-2022 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -61,24 +61,12 @@ tfw_peer_add_conn(TfwPeer *p, struct list_head *conn_list) static inline void tfw_peer_del_conn(TfwPeer *p, struct list_head *conn_list) { - spin_lock_bh(&p->conn_lock); + local_bh_disable(); + spin_lock_nested(&p->conn_lock, SINGLE_DEPTH_NESTING); list_del_init(conn_list); spin_unlock_bh(&p->conn_lock); } -#define tfw_peer_for_each_conn(p, conn, member, cb) \ -({ \ - int r = 0; \ - spin_lock_bh(&(p)->conn_lock); \ - list_for_each_entry(conn, &(p)->conn_list, member) { \ - r = (cb)(conn); \ - if (unlikely((r))) \ - break; \ - } \ - spin_unlock_bh(&(p)->conn_lock); \ - r; \ -}) - #endif /* __PEER_H__ */ diff --git a/fw/sock.c b/fw/sock.c index ec91a43a2..0c4dd7422 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -189,7 +189,7 @@ ss_active_guard_exit(unsigned long val) } static void -__ss_conn_drop_guard_exit(struct sock *sk) +ss_conn_drop_guard_exit(struct sock *sk) { SS_CALL(connection_drop, sk); ss_active_guard_exit(SS_V_ACT_LIVECONN); @@ -538,19 +538,13 @@ EXPORT_SYMBOL(ss_send); * Note: it used to be called in process context as well, at the time when * Tempesta starts or stops. That's not the case right now, but it may change. * - * TODO In some cases we need to close socket aggressively w/o FIN_WAIT_2 state, - * e.g. by sending RST. 
So we need to add second parameter to the function - * which says how to close the socket. - * One of the examples is rcl_req_limit() (it should reset connections). - * See tcp_sk(sk)->linger2 processing in standard tcp_close(). - * * Called with locked socket. */ static void -ss_do_close(struct sock *sk) +ss_do_close(struct sock *sk, int flags) { struct sk_buff *skb; - int data_was_unread = 0; + bool data_was_unread = false; T_DBG2("[%d]: Close socket %p (%s): account=%d refcnt=%u\n", smp_processor_id(), sk, ss_statename[sk->sk_state], @@ -576,10 +570,8 @@ ss_do_close(struct sock *sk) /* The below is mostly copy-paste from tcp_close(). */ sk->sk_shutdown = SHUTDOWN_MASK; - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - tcp_hdr(skb)->fin; - data_was_unread += len; + while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { + data_was_unread = true; T_DBG3("[%d]: free rcv skb %p\n", smp_processor_id(), skb); __kfree_skb(skb); } @@ -589,8 +581,13 @@ ss_do_close(struct sock *sk) if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; - if (data_was_unread) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + if (data_was_unread || (flags & __SS_F_RST)) { + if ((flags & __SS_F_RST)) { + sk->sk_err = ECONNRESET; + sk->sk_shutdown = SHUTDOWN_MASK; + } else { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + } tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } @@ -660,8 +657,15 @@ ss_do_close(struct sock *sk) } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req = tcp_sk(sk)->fastopen_rsk; - if (req != NULL) + if (req) reqsk_fastopen_remove(sk, req, false); + if (flags & __SS_F_RST) + /* + * Evict all data for transmission since we might never + * have enough window from the malicious/misbehaving client. + * Receive queue is purged in inet_csk_destroy_sock(). 
+ */ + tcp_write_queue_purge(sk); inet_csk_destroy_sock(sk); } } @@ -678,8 +682,8 @@ ss_do_close(struct sock *sk) static void ss_linkerror(struct sock *sk) { - ss_do_close(sk); - __ss_conn_drop_guard_exit(sk); + ss_do_close(sk, 0); + ss_conn_drop_guard_exit(sk); sock_put(sk); /* paired with ss_do_close() */ } @@ -753,7 +757,7 @@ ss_tcp_process_skb(struct sock *sk, struct sk_buff *skb, int *processed) if (ss_skb_unroll(&skb_head, skb)) { __kfree_skb(skb); - return SS_DROP; + return SS_BAD; } while ((skb = ss_skb_dequeue(&skb_head))) { @@ -777,7 +781,7 @@ ss_tcp_process_skb(struct sock *sk, struct sk_buff *skb, int *processed) if (unlikely(offset > 0 && ss_skb_chop_head_tail(NULL, skb, offset, 0) != 0)) { - r = SS_DROP; + r = SS_BAD; goto out; } offset = 0; @@ -809,7 +813,7 @@ ss_tcp_process_skb(struct sock *sk, struct sk_buff *skb, int *processed) T_DBG2("Received data FIN on sk=%p, cpu=%d\n", sk, smp_processor_id()); ++tp->copied_seq; - r = SS_DROP; + r = SS_BAD; } out: if (skb_head) @@ -827,11 +831,10 @@ ss_tcp_process_skb(struct sock *sk, struct sk_buff *skb, int *processed) * * TODO #873 process URG. */ -static bool +static int ss_tcp_process_data(struct sock *sk) { - bool droplink = true; - int r, count, processed = 0; + int r = 0, count, processed = 0; unsigned int skb_len, skb_seq; struct sk_buff *skb, *tmp; struct tcp_sock *tp = tcp_sk(sk); @@ -858,14 +861,13 @@ ss_tcp_process_data(struct sock *sk) processed += count; if (r < 0) - goto out; - else if (!count) + break; + if (!count) T_WARN("recvmsg bug: overlapping TCP segment at %X" " seq %X rcvnxt %X len %x\n", tp->copied_seq, skb_seq, tp->rcv_nxt, skb_len); } - droplink = false; out: /* * Recalculate an appropriate TCP receive buffer space @@ -875,7 +877,7 @@ ss_tcp_process_data(struct sock *sk) if (processed) tcp_cleanup_rbuf(sk, processed); - return droplink; + return r; } /* @@ -901,37 +903,13 @@ ss_tcp_data_ready(struct sock *sk) * See sock_queue_err_skb() in linux/net/core/skbuff.c. 
*/ T_ERR("error data in socket %p\n", sk); + return; } - else if (!skb_queue_empty(&sk->sk_receive_queue)) { - if (ss_tcp_process_data(sk) && - !(SS_CONN_TYPE(sk) & Conn_Stop)) { - /* - * Close connection in case of internal errors, - * banned packets, or FIN in the received packet, - * and only if it's not on hold until explicitly - * closed. - * - * ss_close() is responsible for calling - * application layer connection closing callback. - * The callback will free all SKBs linked with - * the message that is currently being processed. - * - * Closing a socket should go through the queue and - * should be done after all pending data has been sent. - * - * TODO #861. ss_tcp_process_data() returns true/false - * on all kind of problems: e.g. inability to unroll an - * skb and a security event. However, the problems are - * very different - we should send pending data in first - * case (SS_BAD) and send RST in the second (SS_DROP). - */ - ss_close(sk, SS_F_SYNC); - } - } - else { + + if (skb_queue_empty(&sk->sk_receive_queue)) { /* * Check for URG data. - * TODO shouldn't we do it in ss_tcp_process_data()? + * TODO #873: shouldn't we do it in ss_tcp_process_data()? */ struct tcp_sock *tp = tcp_sk(sk); if (tp->urg_data & TCP_URG_VALID) { @@ -940,6 +918,32 @@ ss_tcp_data_ready(struct sock *sk) smp_processor_id(), sk); } } + + switch (ss_tcp_process_data(sk)) { + case SS_OK: + return; + case SS_BAD: + /* + * Close connection in case of internal errors, + * banned packets, or FIN in the received packet, + * and only if it's not on hold until explicitly + * closed. + * + * ss_close() is responsible for calling + * application layer connection closing callback. + * The callback will free all SKBs linked with + * the message that is currently being processed. + * + * Closing a socket should go through the queue and + * should be done after all pending data has been sent. 
+ */ + if (!(SS_CONN_TYPE(sk) & Conn_Stop)) + ss_close(sk, SS_F_SYNC); + break; + case SS_DROP: + ss_close(sk, SS_F_ABORT); + break; + } } /** @@ -972,7 +976,7 @@ ss_tcp_state_change(struct sock *sk) * it on our own without calling upper layer hooks. */ if (ss_active_guard_enter(SS_V_ACT_NEWCONN)) { - ss_do_close(sk); + ss_do_close(sk, 0); sock_put(sk); /* * The case of a connect to an upstream server that @@ -980,12 +984,12 @@ ss_tcp_state_change(struct sock *sk) * and ss_active_guard_enter() there. */ if (!lsk) - __ss_conn_drop_guard_exit(sk); + ss_conn_drop_guard_exit(sk); return; } if (lsk && ss_active_guard_enter(SS_V_ACT_LIVECONN)) { - ss_do_close(sk); + ss_do_close(sk, 0); sock_put(sk); ss_active_guard_exit(SS_V_ACT_NEWCONN); return; @@ -1003,6 +1007,7 @@ ss_tcp_state_change(struct sock *sk) if (r) { T_DBG2("[%d]: New connection hook failed, r=%d\n", smp_processor_id(), r); + /* ss_linkerror() decrements SS_V_ACT_LIVECONN. */ ss_linkerror(sk); ss_active_guard_exit(SS_V_ACT_NEWCONN); return; @@ -1047,17 +1052,12 @@ ss_tcp_state_change(struct sock *sk) } else if (sk->sk_state == TCP_CLOSE) { /* - * In current implementation we never reach TCP_CLOSE state - * in regular course of action. When a socket is moved from - * TCP_ESTABLISHED state to a closing state, we forcefully - * close the socket before it can reach the final state. - * - * We get here when an error has occurred in the connection. - * It could be that RST was received which may happen for - * multiple reasons. Or it could be a case of TCP timeout - * where the connection appears to be dead. In all of these - * cases the socket is moved directly to TCP_CLOSE state - * thus skipping all other states. + * We reach the state on regular tcp_close() (including the + * active closing from our side), tcp_abort() or tcp_done() + * in case of connection errors/RST and also tcp_fin() -> + * tcp_time_wait() for FIN_WAIT_2 and TIME_WAIT (also active + * closing) lead to tcp_done(). 
Note that we get here also on + * concurrent closing (TCP_CLOSING). * * It's safe to call the callback since we set socket callbacks * either for just created, not connected, sockets or in the @@ -1379,13 +1379,14 @@ ss_getpeername(struct sock *sk, TfwAddr *addr) } EXPORT_SYMBOL(ss_getpeername); -#define __sk_close_locked(sk) \ -do { \ - ss_do_close(sk); \ - bh_unlock_sock(sk); \ - __ss_conn_drop_guard_exit(sk); \ - sock_put(sk); /* paired with ss_do_close() */ \ -} while (0) +static void +__sk_close_locked(struct sock *sk, int flags) +{ + ss_do_close(sk, flags); + bh_unlock_sock(sk); + ss_conn_drop_guard_exit(sk); + sock_put(sk); /* paired with ss_do_close() */ +} static void ss_tx_action(void) @@ -1434,7 +1435,7 @@ ss_tx_action(void) break; } /* paired with bh_lock_sock() */ - __sk_close_locked(sk); + __sk_close_locked(sk, sw.flags); break; case SS_CLOSE: /* @@ -1456,7 +1457,7 @@ ss_tx_action(void) break; } /* paired with bh_lock_sock() */ - __sk_close_locked(sk); + __sk_close_locked(sk, sw.flags); break; default: BUG(); @@ -1544,7 +1545,7 @@ EXPORT_SYMBOL(ss_wait_newconn); * SS upcalls are protected with SS_V_ACT_LIVECONN. * Can sleep, so must be called from user-space context. */ -void +bool ss_synchronize(void) { int cpu, wq_acc = 0, wq_acc_old = 0; @@ -1581,14 +1582,16 @@ ss_synchronize(void) acm = &per_cpu(__ss_act_cnt, cpu); T_WARN(" cpu %d(%d), backlog size %lu," " active connections mask %#lx," - " cntwork queue size %d\n", + " cntwork queue size %d," + " close backlog is %sempty\n", cpu, smp_processor_id(), cb->size, (unsigned long)atomic64_read(acm), - tfw_wq_size(wq)); + tfw_wq_size(wq), + list_empty(&cb->head) + ? 
"" : "NOT "); } - T_WARN("Memory leakage is possible\n"); - return; + return false; } } else if (acc + wq_acc < acc_old + wq_acc_old) { @@ -1599,6 +1602,8 @@ ss_synchronize(void) wq_acc_old = wq_acc; acc = wq_acc = 0; } + + return true; } /** diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 16d5b42b2..7648bad98 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -307,9 +307,10 @@ __cli_conn_close_cb(TfwConn *conn) } static int -__cli_conn_close_sync_cb(TfwConn *conn) +__cli_conn_abort_cb(TfwConn *conn) { - return tfw_connection_close(conn, true); + tfw_connection_abort(conn); + return 0; } /** @@ -320,10 +321,7 @@ __cli_conn_close_sync_cb(TfwConn *conn) static int tfw_cli_conn_close_all(void *data) { - TfwClient *cli = (TfwClient *)data; - TfwConn *conn; - - return tfw_peer_for_each_conn(cli, conn, list, __cli_conn_close_cb); + return tfw_peer_for_each_conn((TfwPeer *)data, __cli_conn_close_cb); } /** @@ -333,12 +331,10 @@ tfw_cli_conn_close_all(void *data) * connections, trying to cause a work queue overrun and delay security events * handlers. To detach attackers efficiently, we have to use synchronous close. */ -int tfw_cli_conn_close_all_sync(TfwClient *cli) +static int +tfw_cli_conn_abort_all(void *data) { - TfwConn *conn; - - return tfw_peer_for_each_conn(cli, conn, list, - __cli_conn_close_sync_cb); + return tfw_peer_for_each_conn((TfwPeer *)data, __cli_conn_abort_cb); } /* @@ -630,8 +626,7 @@ tfw_listen_socks_array_cmp(const void *l, const void *r) if (cmp) return cmp; - else - return (int)a->sin6_port - (int)b->sin6_port; + return (int)a->sin6_port - (int)b->sin6_port; } /** @@ -770,6 +765,21 @@ tfw_sock_clnt_stop(void) local_bh_enable(); } +/** + * Something wrong went on the network layer, e.g. many ACK segment drops and + * some TLS sockets can not make progress on data transmission, so client + * connection closing callbacks weren't called. This is unlikely, but probable, + * situation. Do hard connections termination. 
+ */ +void +tfw_cli_abort_all(void) +{ + local_bh_disable(); + while (tfw_client_for_each(tfw_cli_conn_abort_all)) + ; + local_bh_enable(); +} + static TfwCfgSpec tfw_sock_clnt_specs[] = { { .name = "listen", diff --git a/fw/sock_srv.c b/fw/sock_srv.c index 31fed9f56..dd9722c3c 100644 --- a/fw/sock_srv.c +++ b/fw/sock_srv.c @@ -572,10 +572,7 @@ tfw_sock_srv_connect_srv(TfwServer *srv) static int tfw_sock_srv_disconnect_srv(TfwServer *srv) { - TfwConn *conn; - - return tfw_peer_for_each_conn(srv, conn, list, - tfw_sock_srv_disconnect); + return tfw_peer_for_each_conn((TfwPeer *)srv, tfw_sock_srv_disconnect); } /* diff --git a/fw/ss_skb.h b/fw/ss_skb.h index 423cb0e56..b0e4f8e8d 100644 --- a/fw/ss_skb.h +++ b/fw/ss_skb.h @@ -3,7 +3,7 @@ * * Synchronous Sockets API for Linux socket buffers manipulation. * - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2022 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -38,7 +38,7 @@ enum { SS_SHUTDOWN = -4, /* Generic socket error. */ SS_BAD = -3, - /* The packet must be dropped. */ + /* The packet must be dropped (typically on a security event). */ SS_DROP = -2, /* The packet should be stashed (made by callback). */ SS_POSTPONE = -1, diff --git a/fw/sync_socket.h b/fw/sync_socket.h index 406bf0e55..b9720947a 100644 --- a/fw/sync_socket.h +++ b/fw/sync_socket.h @@ -119,6 +119,9 @@ ss_proto_inherit(const SsProto *parent, SsProto *child) #define SS_F_CONN_CLOSE 0x04 /* Call TLS encryption hook on the skb transmission. */ #define SS_F_ENCRYPT 0x08 +/* Close with TCP RST (connection abort). */ +#define __SS_F_RST 0x10 +#define SS_F_ABORT (__SS_F_RST | SS_F_SYNC) /* Conversion of skb type (flag) to/from TLS record type. 
*/ #define SS_SKB_TYPE2F(t) (((int)(t)) << 8) @@ -139,7 +142,7 @@ int ss_bind(struct sock *sk, const TfwAddr *addr); int ss_listen(struct sock *sk, int backlog); void ss_getpeername(struct sock *sk, TfwAddr *addr); void ss_wait_newconn(void); -void ss_synchronize(void); +bool ss_synchronize(void); void ss_start(void); void ss_stop(void); bool ss_active(void); diff --git a/fw/t/unit/helpers.c b/fw/t/unit/helpers.c index 8ea05788d..4146a4ec3 100644 --- a/fw/t/unit/helpers.c +++ b/fw/t/unit/helpers.c @@ -15,7 +15,7 @@ * and generic testing functions/macros are located in test.c/test.h * * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2021 Tempesta Technologies, Inc. + * Copyright (C) 2015-2022 Tempesta Technologies, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -150,9 +150,10 @@ ss_close(struct sock *sk, int flags) return 0; } -void +bool ss_synchronize(void) { + return true; } void @@ -183,6 +184,11 @@ tfw_cli_conn_send(TfwCliConn *cli_conn, TfwMsg *msg) return 0; } +void +tfw_cli_abort_all(void) +{ +} + int tfw_gfsm_dispatch(TfwGState *st, void *obj, TfwFsmData *data) { diff --git a/fw/tls.c b/fw/tls.c index 0f6aa772a..c9e593ad0 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -139,7 +139,7 @@ tfw_tls_connection_recv(TfwConn *conn, struct sk_buff *skb) if (unlikely(!nskb)) { spin_unlock(&tls->lock); TFW_INC_STAT_BH(clnt.msgs_otherr); - return T_DROP; + return T_BAD; } } @@ -240,7 +240,7 @@ tfw_tls_tcp_propagate_dseq(struct sock *sk, struct sk_buff *skb) * can add the next skb in the send queue to the current encrypted TLS record. * * We extend the skbs on TCP transmission (when CWND is calculated), so we - * also adjust TPC sequence numbers in the socket. See skb_entail(). + * also adjust TCP sequence numbers in the socket. See skb_entail(). 
*/ int tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) @@ -268,6 +268,8 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) struct page **pages = NULL, **pages_end, **p; struct page *auto_pages[AUTO_SEGS_N]; + assert_spin_locked(&sk->sk_lock.slock); + /* * If client closes connection early, we may get here with sk_user_data * being NULL. @@ -499,6 +501,13 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) (io->alert[1] == TTLS_ALERT_MSG_CLOSE_NOTIFY || io->alert[0] == TTLS_ALERT_LEVEL_FATAL)) { + /* + * If we're not done with transmission within the current + * tcp_write_xmit() call, then the delayed ss_close() socket + * freeing might kill the socket concurrently with TCP + * transmission process leading to NULL pointer dereference. + */ + WARN_ON_ONCE(!tcp_skb_is_last(sk, skb_tail)); ss_close(sk, SS_F_SYNC); } @@ -512,27 +521,17 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) * We can not send unencrypted data and can not normally close the * socket with FIN since we're in progress on sending from the write * queue. - * - * TODO #861 Send RST, move the socket to dead state, and drop all - * the pending unencrypted data. We can not use tcp_v4_send_reset() - * since it works solely in response to ingress segment. */ err_kill_sock: - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_err = ECONNRESET; - tcp_set_state(sk, TCP_CLOSE); - sk->sk_shutdown = SHUTDOWN_MASK; - sock_set_flag(sk, SOCK_DEAD); - } + ss_close(sk, SS_F_ABORT); + goto err_epilogue; err_purge_tcp_write_queue: /* * Leave encrypted segments in the retransmission rb-tree, * but purge the send queue on unencrypted segments. 
*/ - while ((skb = tcp_send_head(sk))) { - __skb_unlink(skb, &sk->sk_write_queue); - sk_wmem_free_skb(sk, skb); - } + tcp_write_queue_purge(sk); +err_epilogue: T_WARN("%s: cannot encrypt data (%d), only partial data was sent\n", __func__, r); return r; @@ -696,7 +695,8 @@ tfw_tls_conn_close(TfwConn *c, bool sync) spin_unlock(&tls->lock); /* - * ttls_close_notify() calls ss_send() with SS_F_CONN_CLOSE flag, so + * Once the TLS close notify alert is going to be sent by + * tcp_write_xmit(), tfw_tls_encrypt() calls ss_close(), so * if the call succeeded, then we'll close the socket with the alert * transmission. Otherwise if we have to close the socket * and can not write to the socket, then there is no other way than @@ -714,6 +714,12 @@ tfw_tls_conn_close(TfwConn *c, bool sync) return r; } +static void +tfw_tls_conn_abort(TfwConn *c) +{ + ss_close(c->sk, SS_F_ABORT); +} + static void tfw_tls_conn_drop(TfwConn *c) { @@ -765,6 +771,7 @@ tfw_tls_conn_send(TfwConn *c, TfwMsg *msg) static TfwConnHooks tls_conn_hooks = { .conn_init = tfw_tls_conn_init, .conn_close = tfw_tls_conn_close, + .conn_abort = tfw_tls_conn_abort, .conn_drop = tfw_tls_conn_drop, .conn_send = tfw_tls_conn_send, }; diff --git a/fw/websocket.c b/fw/websocket.c index 832387839..f2440ce16 100644 --- a/fw/websocket.c +++ b/fw/websocket.c @@ -200,6 +200,14 @@ tfw_ws_conn_close(TfwConn *conn, bool sync) return r; } +static void +tfw_ws_conn_abort(TfwConn *conn) +{ + T_DBG("%s: conn=[%p]\n", __func__, conn); + + tfw_conn_hook_call(TFW_CONN_HTTP_TYPE(conn), conn, conn_abort); +} + static TfwConn * tfw_ws_conn_unpair(TfwConn *conn) { @@ -251,12 +259,14 @@ tfw_ws_conn_send(TfwConn *conn, TfwMsg *msg) static TfwConnHooks ws_conn_hooks = { .conn_close = tfw_ws_conn_close, + .conn_abort = tfw_ws_conn_abort, .conn_drop = tfw_ws_conn_drop, .conn_send = tfw_ws_conn_send, }; static TfwConnHooks wss_conn_hooks = { .conn_close = tfw_ws_conn_close, + .conn_abort = tfw_ws_conn_abort, .conn_drop = tfw_ws_conn_drop, 
.conn_send = tfw_ws_conn_send, }; diff --git a/lib/log.h b/lib/log.h index 3256fc2d9..ec1d07299 100644 --- a/lib/log.h +++ b/lib/log.h @@ -1,7 +1,7 @@ /** * Tempesta kernel library * - * Copyright (C) 2015-2018 Tempesta Technologies, INC. + * Copyright (C) 2015-2022 Tempesta Technologies, INC. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ enum { /* Generic error. */ T_BAD = -3, - /* The message must be dropped. */ + /* The message must be dropped (typically on a security event). */ T_DROP = -2, /* The message should be stashed (made by callback). */ T_POSTPONE = -1, diff --git a/linux-5.10.35.patch b/linux-5.10.35.patch index 211a7a1f2..e6974414d 100644 --- a/linux-5.10.35.patch +++ b/linux-5.10.35.patch @@ -677,7 +677,7 @@ index a828cf99c..b877eb543 100644 * @skb: buffer to check diff --git a/include/linux/tempesta.h b/include/linux/tempesta.h new file mode 100644 -index 000000000..55049bd32 +index 000000000..8e9b6af75 --- /dev/null +++ b/include/linux/tempesta.h @@ -0,0 +1,54 @@ @@ -925,10 +925,10 @@ index d73aed0fc..d19a4ecc1 100644 +obj-$(CONFIG_SECURITY_TEMPESTA) += tempesta_mm.o diff --git a/mm/tempesta_mm.c b/mm/tempesta_mm.c new file mode 100644 -index 000000000..9dc507aab +index 000000000..7ee3ead54 --- /dev/null +++ b/mm/tempesta_mm.c -@@ -0,0 +1,278 @@ +@@ -0,0 +1,274 @@ +/** + * Tempesta Memory Reservation + * @@ -1995,7 +1995,7 @@ index 45fb450b4..48da5be43 100644 offset++; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 2384ac048..b891e370b 100644 +index 2384ac048..920b1f01f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -322,6 +322,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); @@ -2110,6 +2110,14 @@ index 2384ac048..b891e370b 100644 void tcp_close(struct sock *sk, long timeout) { +@@ -2627,6 +2644,7 @@ void tcp_write_queue_purge(struct sock *sk) + tcp_sk(sk)->packets_out = 0; + inet_csk(sk)->icsk_backoff = 0; + } 
++EXPORT_SYMBOL_GPL(tcp_write_queue_purge); + + int tcp_disconnect(struct sock *sk, int flags) + { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fac5c1469..623d4f33e 100644 --- a/net/ipv4/tcp_input.c @@ -2521,7 +2529,7 @@ index 000000000..4c439ac0c +tempesta-y := tempesta_lsm.o diff --git a/security/tempesta/tempesta_lsm.c b/security/tempesta/tempesta_lsm.c new file mode 100644 -index 000000000..13054fb4a +index 000000000..b8e8d67dc --- /dev/null +++ b/security/tempesta/tempesta_lsm.c @@ -0,0 +1,138 @@ diff --git a/tls/ttls.c b/tls/ttls.c index d468397c0..570c5aba2 100644 --- a/tls/ttls.c +++ b/tls/ttls.c @@ -1298,14 +1298,14 @@ ttls_handle_alert(TlsCtx *tls) /* Ignore non-fatal alerts, except close_notify. */ if (io->alert[0] == TTLS_ALERT_LEVEL_FATAL) { T_DBG2("is a fatal alert message (msg %d)\n", io->alert[1]); - return T_DROP; + return T_BAD; } if (io->alert[0] == TTLS_ALERT_LEVEL_WARNING && io->alert[1] == TTLS_ALERT_MSG_CLOSE_NOTIFY) { T_DBG2("is a close notify message\n"); ttls_close_notify(tls); - return T_DROP; + return T_BAD; } /* Silently ignore: fetch new message */ @@ -2185,8 +2185,7 @@ ttls_recv(void *tls_data, unsigned char *buf, unsigned int len, unsigned int *re if (unlikely(!ttls_xfrm_ready(tls))) { if (!(r = ttls_handle_alert(tls))) return T_OK; - TTLS_WARN(tls, "Bad TLS alert on handshake\n"); - return T_DROP; + return T_BAD; } break; @@ -2199,7 +2198,7 @@ ttls_recv(void *tls_data, unsigned char *buf, unsigned int len, unsigned int *re TTLS_WARN(tls, "refusing renegotiation, sending alert\n"); ttls_send_alert(tls, TTLS_ALERT_LEVEL_FATAL, TTLS_ALERT_MSG_NO_RENEGOTIATION); - return T_DROP; + return T_BAD; } /* @@ -2237,7 +2236,7 @@ ttls_recv(void *tls_data, unsigned char *buf, unsigned int len, unsigned int *re */ if (unlikely(tls->state != TTLS_HANDSHAKE_OVER)) { TTLS_WARN(tls, "TLS context isn't ready after handshake\n"); - return T_DROP; + return T_BAD; } break; } @@ -2255,13 +2254,13 @@ ttls_recv(void *tls_data, unsigned char 
*buf, unsigned int len, unsigned int *re if ((r = ttls_decrypt(tls, NULL))) { TTLS_WARN(tls, "TLS cannot decrypt msg on state %x, ret=%d%s\n", tls->state, r, r == -EBADMSG ? "(bad ciphertext)" : ""); - return T_DROP; + return T_BAD; } if (io->msgtype == TTLS_MSG_ALERT) { if (!(r = ttls_handle_alert(tls))) return T_OK; - return T_DROP; + return T_BAD; } return T_OK; From 614a4944b9470b25eba0a93f5fa3e5fb2e6f13fc Mon Sep 17 00:00:00 2001 From: Alexander K Date: Wed, 17 Aug 2022 01:39:17 +0300 Subject: [PATCH 14/26] tfw_http_search_cookie() may pass chunk == end, which is past the last chunk, to tfw_str_collect_cmp(). First check chunk == end in tfw_str_collect_cmp(), and only after that assert that chunk is a plain string. --- fw/http_match.c | 2 ++ fw/str.c | 1 + 2 files changed, 3 insertions(+) diff --git a/fw/http_match.c b/fw/http_match.c index f5b075791..2c422c764 100644 --- a/fw/http_match.c +++ b/fw/http_match.c @@ -915,6 +915,7 @@ tfw_http_search_cookie(const char *cstr, unsigned long clen, TfwStr *chunk, *end; TfwStr tmp = { 0 }; unsigned int n = cookie->nchunks; + /* Search cookie name. */ end = cookie->chunks + cookie->nchunks; for (chunk = cookie->chunks; chunk != end; ++chunk, --n) { @@ -960,6 +961,7 @@ tfw_http_search_cookie(const char *cstr, unsigned long clen, } } else { + WARN_ON_ONCE(1); continue; } /* diff --git a/fw/str.c b/fw/str.c index d9755b579..69cb04263 100644 --- a/fw/str.c +++ b/fw/str.c @@ -565,6 +565,7 @@ void tfw_str_collect_cmp(TfwStr *chunk, TfwStr *end, TfwStr *out, bzero_fast(out, sizeof(*out)); return; } + BUG_ON(!TFW_STR_PLAIN(chunk)); BUG_ON(!TFW_STR_PLAIN(chunk)); From f492a0b0af746c9617aafcc177323809856cf0c8 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Wed, 17 Aug 2022 13:50:34 +0300 Subject: [PATCH 15/26] Fix KASAN use-after-free in tfw_connection_drop(): conn_drop hook may free the conn and we should not dereference the pointer even for the assertion.
--- fw/connection.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fw/connection.c b/fw/connection.c index 0e457d164..88741cc4a 100644 --- a/fw/connection.c +++ b/fw/connection.c @@ -86,7 +86,6 @@ tfw_connection_drop(TfwConn *conn) { /* Ask higher levels to free resources at connection close. */ TFW_CONN_HOOK_CALL(conn, conn_drop); - BUG_ON(conn->stream.msg); } /* From 617eb70d00d794dce0547e7cfed1f7cc2f10da38 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Wed, 17 Aug 2022 15:19:12 +0300 Subject: [PATCH 16/26] Add WS and WSS protocols for connection cache selector tfw_cli_cache() - it is mainly used on connection freeing since WS(S) is upgraded from HTTP(S). --- fw/sock_clnt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 7648bad98..fa8776717 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -53,8 +53,10 @@ tfw_cli_cache(int type) case TFW_FSM_H2: return tfw_h2_conn_cache; case TFW_FSM_HTTPS: + case TFW_FSM_WSS: return tfw_https_conn_cache; case TFW_FSM_HTTP: + case TFW_FSM_WS: return tfw_h1_conn_cache; default: BUG(); From 9139078e5a59682913044e2eea191f5bc4318158 Mon Sep 17 00:00:00 2001 From: Constantine Date: Wed, 31 Aug 2022 17:51:56 +0300 Subject: [PATCH 17/26] Fix sockets hang after error during starting tempesta If one of the sockets can't start listening in tfw_sock_clnt_start, we just break the initialization loop and clean up some data, but already opened sockets won't be released, since the module has not been started yet and tfw_sock_clnt_stop must not be called for such modules. Now releasing of sockets has been added to tfw_listen_sock_del_all, which must be called from the cleanup callback even on tfw_sock_clnt_start failure.
Also tfw_listen_sock_del_all now frees tfw_listen_socks list --- fw/sock_clnt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index fa8776717..b21630c58 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -426,11 +426,22 @@ tfw_listen_sock_del_all(void) { TfwListenSock *ls, *tmp; + list_for_each_entry(ls, &tfw_listen_socks, list) { + if (ls->sk) + /* + * If error occurred during starting module, + * release sockets which were bound. + */ + ss_release(ls->sk); + kfree(ls); + } + list_for_each_entry_safe(ls, tmp, &tfw_listen_socks_reconf, list) { BUG_ON(ls->sk); kfree(ls); } + INIT_LIST_HEAD(&tfw_listen_socks); INIT_LIST_HEAD(&tfw_listen_socks_reconf); tfw_classifier_cleanup_inport(); } From 3d8037ce760424de953257bf65f8ddf3ad6fc165 Mon Sep 17 00:00:00 2001 From: Constantine Date: Thu, 1 Sep 2022 16:26:56 +0300 Subject: [PATCH 18/26] tfw_listen_sock_del_all: Fixed incorrect usage of list_for_each_entry --- fw/sock_clnt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index b21630c58..82b68042b 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -426,18 +426,20 @@ tfw_listen_sock_del_all(void) { TfwListenSock *ls, *tmp; - list_for_each_entry(ls, &tfw_listen_socks, list) { + list_for_each_entry_safe(ls, tmp, &tfw_listen_socks, list) { if (ls->sk) /* * If error occurred during starting module, * release sockets which were bound. */ ss_release(ls->sk); + list_del(&ls->list); kfree(ls); } list_for_each_entry_safe(ls, tmp, &tfw_listen_socks_reconf, list) { BUG_ON(ls->sk); + list_del(&ls->list); kfree(ls); } From 06b21579b5e8e5907fa0d8416af915b72989220b Mon Sep 17 00:00:00 2001 From: Alexander K Date: Fri, 2 Sep 2022 14:51:28 +0300 Subject: [PATCH 19/26] Update the code of ss_do_close() according to the current 5.10.35 code. 
--- fw/sock.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/fw/sock.c b/fw/sock.c index 0c4dd7422..cc1f1150e 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -567,7 +567,7 @@ ss_do_close(struct sock *sk, int flags) */ sk->sk_lock.owned = 1; - /* The below is mostly copy-paste from tcp_close(). */ + /* The below is mostly copy-paste from tcp_close(), 5.10.35. */ sk->sk_shutdown = SHUTDOWN_MASK; while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { @@ -584,7 +584,6 @@ ss_do_close(struct sock *sk, int flags) if (data_was_unread || (flags & __SS_F_RST)) { if ((flags & __SS_F_RST)) { sk->sk_err = ECONNRESET; - sk->sk_shutdown = SHUTDOWN_MASK; } else { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); } @@ -592,30 +591,39 @@ ss_do_close(struct sock *sk, int flags) tcp_send_active_reset(sk, sk->sk_allocation); } else if (tcp_close_state(sk)) { - /* The code below is taken from tcp_send_fin(). */ + /* The code below is taken from tcp_send_fin(), 5.10.35. */ + struct sk_buff *skb, *tskb, *tail; struct tcp_sock *tp = tcp_sk(sk); - int mss_now = tcp_current_mss(sk); - skb = tcp_write_queue_tail(sk); + tskb = tail = tcp_write_queue_tail(sk); + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); - if (skb && tcp_send_head(sk)) { + if (tskb) { /* Send FIN with data if we have any. */ - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; - TCP_SKB_CB(skb)->end_seq++; + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; + if (!tail) { + WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); + goto adjudge_to_death; + } } else { /* No data to send in the socket, allocate new skb. 
*/ - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); - if (!skb) { + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { T_WARN("can't send FIN due to bad alloc"); - } else { - tcp_init_nondata_skb(skb, tp->write_seq, - TCPHDR_ACK | TCPHDR_FIN); - tcp_queue_skb(sk, skb); + goto adjudge_to_death; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); + skb_reserve(skb, MAX_TCP_HEADER); + ss_forced_mem_schedule(sk, skb->truesize); + tcp_init_nondata_skb(skb, tp->write_seq, + TCPHDR_ACK | TCPHDR_FIN); + tcp_queue_skb(sk, skb); } - __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); + __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } adjudge_to_death: @@ -639,8 +647,7 @@ ss_do_close(struct sock *sk, int flags) if (sk->sk_state == TCP_FIN_WAIT2) { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, - tmo - TCP_TIMEWAIT_LEN); + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); return; @@ -656,7 +663,10 @@ ss_do_close(struct sock *sk, int flags) } } if (sk->sk_state == TCP_CLOSE) { - struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + struct request_sock *req; + + req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + lockdep_sock_is_held(sk)); if (req) reqsk_fastopen_remove(sk, req, false); if (flags & __SS_F_RST) From 3a82e948bec6347689fea01c25b90a2d5aac1173 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Fri, 2 Sep 2022 15:07:03 +0300 Subject: [PATCH 20/26] Remove extra assertion (merged in other PR) --- fw/str.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fw/str.c b/fw/str.c index 69cb04263..857ab9f4d 100644 --- a/fw/str.c +++ b/fw/str.c @@ -567,8 +567,6 @@ void tfw_str_collect_cmp(TfwStr *chunk, TfwStr *end, TfwStr *out, } BUG_ON(!TFW_STR_PLAIN(chunk)); - BUG_ON(!TFW_STR_PLAIN(chunk)); - /* If this is last chunk, just return it in this case. 
*/ next = chunk + 1; if (likely(next == end || (stop && *next->data == *stop))) { From eaabf8e19517ec8f842c04028ad85c15c008cc6c Mon Sep 17 00:00:00 2001 From: Alexander K Date: Fri, 2 Sep 2022 15:34:25 +0300 Subject: [PATCH 21/26] Fix memory leak in tfw_tls_conn_init() on unsuccessful call of tfw_conn_hook_call() or tfw_h2_context_init(): call TLS connection destructor on yet not fully initialized TLS connection handler. Nullify TfwHPackDTbl pool pointer after destruction to let tfw_hpack_clean() call on the handler without double-frees. --- fw/hpack.c | 5 ++--- fw/tls.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fw/hpack.c b/fw/hpack.c index c65753056..e60663169 100644 --- a/fw/hpack.c +++ b/fw/hpack.c @@ -386,8 +386,6 @@ do { \ WARN_ON_ONCE(hp->length); \ } while (0) -static unsigned long act_hp_str_n; - void write_int(unsigned long index, unsigned short max, unsigned short mask, TfwHPackInt *__restrict res_idx) @@ -1093,8 +1091,10 @@ tfw_hpack_init(TfwHPack *__restrict hp, unsigned int htbl_sz) err_et: tfw_pool_destroy(dt->h_pool); + dt->h_pool = NULL; err_dt: tfw_pool_destroy(dt->pool); + dt->pool = NULL; return -ENOMEM; } @@ -1105,7 +1105,6 @@ tfw_hpack_clean(TfwHPack *__restrict hp) tfw_pool_destroy(hp->enc_tbl.pool); tfw_pool_destroy(hp->dec_tbl.h_pool); tfw_pool_destroy(hp->dec_tbl.pool); - WARN_ON_ONCE(act_hp_str_n); } /* diff --git a/fw/tls.c b/fw/tls.c index c9e593ad0..ad596797a 100644 --- a/fw/tls.c +++ b/fw/tls.c @@ -666,12 +666,14 @@ tfw_tls_conn_init(TfwConn *c) return -EINVAL; } - if (tfw_conn_hook_call(TFW_FSM_HTTP, c, conn_init)) - return -EINVAL; + if (tfw_conn_hook_call(TFW_FSM_HTTP, c, conn_init)) { + r = -EINVAL; + goto err_cleanup; + } if (TFW_FSM_TYPE(c->proto.type) == TFW_FSM_H2) if ((r = tfw_h2_context_init(tfw_h2_context(c)))) - return r; + goto err_cleanup; /* * We never hook TLS connections in GFSM, but initialize it with 0 state @@ -682,6 +684,9 @@ tfw_tls_conn_init(TfwConn *c) c->destructor = 
tfw_tls_conn_dtor; return 0; +err_cleanup: + tfw_tls_conn_dtor(c); + return r; } static int From f0fa97aa3a38183f70a23133ba19ac00a192a7d1 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Fri, 2 Sep 2022 16:05:02 +0300 Subject: [PATCH 22/26] Better comment for tfw_peer_for_each_conn() about server and client connection drops. --- fw/connection.h | 9 ++++++++- fw/sock_srv.c | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fw/connection.h b/fw/connection.h index e23b61f60..80eef3b9f 100644 --- a/fw/connection.h +++ b/fw/connection.h @@ -532,7 +532,14 @@ tfw_peer_for_each_conn(TfwPeer *p, int (*cb)(TfwConn *)) spin_lock_bh(&p->conn_lock); - /* @cb() may delete connections from the list. */ + /* + * @cb() may delete connections from the list. + * Typically, this happens on connection_drop callbacks on sockets closing. + * However, note that client and server connections drops are logically + * different: client connections are just freed with all linked resources, + * while the high level server connection handlers are preserved for + * connection repair and freed on shutdown only. + */ list_for_each_entry_safe(conn, tmp_conn, &p->conn_list, list) { r = cb(conn); if (unlikely(r)) diff --git a/fw/sock_srv.c b/fw/sock_srv.c index dd9722c3c..3da2e8a8f 100644 --- a/fw/sock_srv.c +++ b/fw/sock_srv.c @@ -626,6 +626,8 @@ tfw_srv_conn_free(TfwSrvConn *srv_conn) { BUG_ON(timer_pending(&srv_conn->timer)); + tfw_connection_unlink_from_peer((TfwConn *)srv_conn); + /* Check that all nested resources are freed. 
*/ tfw_connection_validate_cleanup((TfwConn *)srv_conn); BUG_ON(!list_empty(&srv_conn->nip_queue)); @@ -680,10 +682,8 @@ tfw_sock_srv_del_conns(void *psrv) TfwSrvConn *srv_conn, *tmp; TfwServer *srv = psrv; - list_for_each_entry_safe(srv_conn, tmp, &srv->conn_list, list) { - tfw_connection_unlink_from_peer((TfwConn *)srv_conn); + list_for_each_entry_safe(srv_conn, tmp, &srv->conn_list, list) tfw_srv_conn_free(srv_conn); - } } static int From 2f3a6a37a9754e48cb3496a1c8cd91e6075f23ff Mon Sep 17 00:00:00 2001 From: Alexander K Date: Fri, 2 Sep 2022 17:01:49 +0300 Subject: [PATCH 23/26] Fix a typo and unnecessary nullification --- fw/sock_clnt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 82b68042b..f7123ad76 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -721,7 +721,6 @@ tfw_sock_clnt_start(void) if (!ls->sk) continue; ss_release(ls->sk); - ls->sk = NULL; kfree(ls); } @@ -850,7 +849,7 @@ tfw_sock_clnt_init(void) tfw_https_conn_cache = kmem_cache_create("tfw_https_conn_cache", sizeof(TfwTlsConn), 0, 0, NULL); if (!tfw_https_conn_cache) { - kmem_cache_destroy(tfw_https_conn_cache); + kmem_cache_destroy(tfw_h1_conn_cache); return -ENOMEM; } From e8cfb48efb3a17aa87b7f25c6bc83d62fa51f5a8 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sun, 4 Sep 2022 16:46:19 +0300 Subject: [PATCH 24/26] Fix the race between tfw_sock_clnt_new() callback and tfw_sock_clnt_start(): while a new child socket is created and initialized by the listening socket, the listening socket, which was removed from the reloaded configuration, is freed during the reconfiguration. There actually was no requirement to use the listening socket in the child initialization. The patch removes `listener` from SsProto as doubling TfwListenSock->sk, so all connections will use smaller memory. Also remove the unused synchronous sockets tests.
--- fw/sock.c | 47 +-- fw/sock_clnt.c | 81 +++--- fw/sync_socket.h | 10 +- fw/t/bomber.c | 1 - fw/t/sync_sockets/Makefile | 59 ---- fw/t/sync_sockets/client.cc | 121 -------- fw/t/sync_sockets/kernel/Makefile | 17 -- fw/t/sync_sockets/kernel/kserver.c | 365 ------------------------ fw/t/sync_sockets/kernel/sync_kclient.c | 338 ---------------------- fw/t/sync_sockets/kernel/sync_kserver.c | 194 ------------- fw/t/sync_sockets/server.cc | 283 ------------------ 11 files changed, 66 insertions(+), 1450 deletions(-) delete mode 100644 fw/t/sync_sockets/Makefile delete mode 100644 fw/t/sync_sockets/client.cc delete mode 100644 fw/t/sync_sockets/kernel/Makefile delete mode 100644 fw/t/sync_sockets/kernel/kserver.c delete mode 100644 fw/t/sync_sockets/kernel/sync_kclient.c delete mode 100644 fw/t/sync_sockets/kernel/sync_kserver.c delete mode 100644 fw/t/sync_sockets/server.cc diff --git a/fw/sock.c b/fw/sock.c index cc1f1150e..b5aac1637 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -969,9 +969,13 @@ ss_tcp_state_change(struct sock *sk) TFW_VALIDATE_SK_LOCK_OWNER(sk); if (sk->sk_state == TCP_ESTABLISHED) { - /* Process the new TCP connection. */ - SsProto *proto = sk->sk_user_data; - struct sock *lsk = proto->listener; + /* + * Process the new TCP connection. + * The kernel sets sk_allocation to GFP_KERNEL, so this way we + * can differentiate server sockets, created by us, and client + * sockets created by the kernel. + */ + bool is_srv_sock = (sk->sk_allocation == GFP_ATOMIC); int r; /* @@ -993,12 +997,12 @@ ss_tcp_state_change(struct sock *sk) * cannot be completed now. Paired with ss_connect() * and ss_active_guard_enter() there.
*/ - if (!lsk) + if (is_srv_sock) ss_conn_drop_guard_exit(sk); return; } - if (lsk && ss_active_guard_enter(SS_V_ACT_LIVECONN)) { + if (!is_srv_sock && ss_active_guard_enter(SS_V_ACT_LIVECONN)) { ss_do_close(sk, 0); sock_put(sk); ss_active_guard_exit(SS_V_ACT_NEWCONN); @@ -1024,16 +1028,11 @@ ss_tcp_state_change(struct sock *sk) } sock_set_flag(sk, SOCK_TEMPESTA); - if (lsk) { - /* - * This is a new socket for an accepted connect - * request that the kernel has allocated itself. - * Kernel initializes this field to GFP_KERNEL. - * Tempesta works with sockets in SoftIRQ context, - * so set it to atomic allocation. - */ - sk->sk_allocation = GFP_ATOMIC; - } + /* + * Tempesta works with sockets in SoftIRQ context, so always use + * atomic allocations only. + */ + sk->sk_allocation = GFP_ATOMIC; ss_active_guard_exit(SS_V_ACT_NEWCONN); } else if (sk->sk_state == TCP_CLOSE_WAIT) { @@ -1079,20 +1078,6 @@ ss_tcp_state_change(struct sock *sk) } } -void -ss_proto_init(SsProto *proto, const SsHooks *hooks, int type) -{ - proto->hooks = hooks; - proto->type = type; - - /* - * The memory allocated for @proto should be already zero'ed, so don't - * initialize this field to NULL, but instead check the invariant. - */ - WARN_ON_ONCE(proto->listener); -} -EXPORT_SYMBOL(ss_proto_init); - /** * Make data socket serviced by synchronous sockets. * @@ -1126,12 +1111,9 @@ EXPORT_SYMBOL(ss_set_callbacks); void ss_set_listen(struct sock *sk) { - ((SsProto *)sk->sk_user_data)->listener = sk; - sk->sk_state_change = ss_tcp_state_change; sock_set_flag(sk, SOCK_TEMPESTA); } -EXPORT_SYMBOL(ss_set_listen); /* * Create a new socket for IPv4 or IPv6 protocol. The original functions @@ -1547,7 +1529,6 @@ ss_wait_newconn(void) } } } -EXPORT_SYMBOL(ss_wait_newconn); /** * Wait until there are no queued works and no running tasklets. 
diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index f7123ad76..0cc404bfe 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -157,16 +157,17 @@ tfw_cli_conn_send(TfwCliConn *cli_conn, TfwMsg *msg) return r; } +static const SsHooks *tfw_sock_clnt_hooks(int type); + /** * This hook is called when a new client connection is established. */ static int tfw_sock_clnt_new(struct sock *sk) { - int r = -ENOMEM; + int r = -ENOMEM, type; TfwClient *cli; TfwConn *conn; - SsProto *listen_sock_proto; TfwAddr addr; T_DBG3("new client socket: sk=%p, state=%u\n", sk, sk->sk_state); @@ -178,7 +179,7 @@ tfw_sock_clnt_new(struct sock *sk) * from referencing TfwListenSock{} while a new TfwConn{} object * is not yet allocated/initialized. */ - listen_sock_proto = sk->sk_user_data; + type = (long)sk->sk_user_data; tfw_connection_unlink_from_sk(sk); ss_getpeername(sk, &addr); @@ -188,13 +189,13 @@ tfw_sock_clnt_new(struct sock *sk) return -ENOENT; } - conn = (TfwConn *)tfw_cli_conn_alloc(listen_sock_proto->type); + conn = (TfwConn *)tfw_cli_conn_alloc(type); if (!conn) { T_ERR("can't allocate a new client connection\n"); goto err_client; } - ss_proto_inherit(listen_sock_proto, &conn->proto); + ss_proto_init(&conn->proto, tfw_sock_clnt_hooks(type), type); BUG_ON(!(conn->proto.type & Conn_Clnt)); conn->destructor = (void *)tfw_cli_conn_release; @@ -297,6 +298,24 @@ static const SsHooks tfw_sock_tls_clnt_ss_hooks = { .connection_recv = tfw_tls_connection_recv, }; +static const SsHooks * +tfw_sock_clnt_hooks(int type) +{ + switch (type) { + case TFW_FSM_HTTP: + return &tfw_sock_http_clnt_ss_hooks; + case TFW_FSM_HTTPS: + case TFW_FSM_H2: + /* + * We call the same TLS hooks before generic HTTP processing + * for both the HTTP/1 and HTTP/2. 
+ */ + return &tfw_sock_tls_clnt_ss_hooks; + default: + BUG(); + } +} + static int __cli_conn_close_cb(TfwConn *conn) { @@ -383,23 +402,7 @@ static int tfw_listen_sock_add(const TfwAddr *addr, int type) { TfwListenSock *ls; - const SsHooks *shooks; - - switch (type) { - case TFW_FSM_HTTP: - shooks = &tfw_sock_http_clnt_ss_hooks; - break; - case TFW_FSM_HTTPS: - case TFW_FSM_H2: - /* - * We call the same TLS hooks before generic HTTP processing - * for both the HTTP/1 and HTTP/2. - */ - shooks = &tfw_sock_tls_clnt_ss_hooks; - break; - default: - return -EINVAL; - } + const SsHooks *shooks = tfw_sock_clnt_hooks(type); /* Is there such an address on the list already? */ list_for_each_entry(ls, &tfw_listen_socks_reconf, list) { @@ -471,15 +474,19 @@ tfw_listen_sock_start(TfwListenSock *ls) /* * Link the new socket and TfwListenSock. - * That must be done before calling ss_set_listen() that uses SsProto. + * + * sk_user_data for listening sockets is used as an inherited type for + * children sockets only, so we just store the socket type here. + * This way initialization of passively open sockets doesn't depend + * on the listening socket, which might be closed during a new connection + * establishing. + * + * When a listening socket is closed, the children sockets might live for + * an unlimited time. */ ls->sk = sk; - sk->sk_user_data = ls; + sk->sk_user_data = (void *)(long)ls->proto.type; - /* - * For listening sockets we use - * ss_set_listen() instead of ss_set_callbacks(). - */ ss_set_listen(sk); inet_sk(sk)->freebind = 1; @@ -718,9 +725,16 @@ tfw_sock_clnt_start(void) listen_socks_sz--; list_del(&ls->list); - if (!ls->sk) - continue; - ss_release(ls->sk); + if (ls->sk) { + ss_release(ls->sk); + /* + * There is at least one listener, which we need to close, so + * wait while all new connections finish before freeing the + * listeners. This prevents racing of the function with + * tfw_sock_clnt_new().
+ */ + ss_wait_newconn(); + } kfree(ls); } @@ -748,7 +762,10 @@ tfw_sock_clnt_stop(void) might_sleep(); - /* Stop listening sockets. */ + /* + * Stop listening sockets, but leave them in the list to be freed by + * tfw_cfgop_cleanup_sock_clnt(). + */ list_for_each_entry(ls, &tfw_listen_socks, list) { if (!ls->sk) continue; diff --git a/fw/sync_socket.h b/fw/sync_socket.h index b9720947a..3d2a1950c 100644 --- a/fw/sync_socket.h +++ b/fw/sync_socket.h @@ -31,7 +31,6 @@ /* Protocol descriptor. */ typedef struct ss_proto_t { const struct ss_hooks *hooks; - struct sock *listener; int type; } SsProto; @@ -106,9 +105,10 @@ ss_sock_live(struct sock *sk) } static inline void -ss_proto_inherit(const SsProto *parent, SsProto *child) +ss_proto_init(SsProto *proto, const SsHooks *hooks, int type) { - *child = *parent; + proto->hooks = hooks; + proto->type = type; } /* Synchronous operation required. */ @@ -127,10 +127,6 @@ ss_proto_inherit(const SsProto *parent, SsProto *child) #define SS_SKB_TYPE2F(t) (((int)(t)) << 8) #define SS_SKB_F2TYPE(f) ((f) >> 8) -int ss_hooks_register(SsHooks* hooks); -void ss_hooks_unregister(SsHooks* hooks); - -void ss_proto_init(SsProto *proto, const SsHooks *hooks, int type); void ss_set_callbacks(struct sock *sk); void ss_set_listen(struct sock *sk); int ss_send(struct sock *sk, struct sk_buff **skb_head, int flags); diff --git a/fw/t/bomber.c b/fw/t/bomber.c index 05d122960..32a138513 100644 --- a/fw/t/bomber.c +++ b/fw/t/bomber.c @@ -120,7 +120,6 @@ static TfwBmbTask *bmb_tasks; static inline void __check_conn(TfwBmbConn *conn) { - BUG_ON(conn->proto.listener); BUG_ON(conn->proto.hooks != &bmb_hooks); BUG_ON(!conn->sk); BUG_ON(!conn->task); diff --git a/fw/t/sync_sockets/Makefile b/fw/t/sync_sockets/Makefile deleted file mode 100644 index 8bc79daa5..000000000 --- a/fw/t/sync_sockets/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -# Performance test for Synchronous Sockets. -# -# Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com).
-# -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, -# or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. -# See the GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along with -# this program; if not, write to the Free Software Foundation, Inc., 59 -# Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -ifdef MSGSIZE - DEFINES = -DMSG_SZ=$(MSGSIZE) -else - DEFINES = -DMSG_SZ=64 -endif -ifdef KERNEL -KVERSION = $(KERNEL) -else -KVERSION = $(shell uname -r) -endif -KBUILD_EXTRA_SYMBOLS = $(PWD)/../Module.symvers - -ifndef CXX - CXX = g++ -endif - -export DEFINES KBUILD_EXTRA_SYMBOLS - -TARGETS = client server -CXXFLAGS = -O2 -std=gnu++0x -ggdb -Wall -Werror -LDFLAGS = -lpthread - -all : $(TARGETS) kernel - -client : client.o - $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) - -server : server.o - $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) - -kernel : FORCE - $(MAKE) -C /lib/modules/$(KVERSION)/build M=$(PWD)/kernel modules - -%.o : %.cc $(HEADERS) - $(CXX) $(CXXFLAGS) -c $< -o $@ $(DEFINES) - -clean : FORCE - rm -f *.o $(TARGETS) - $(MAKE) -C /lib/modules/$(KVERSION)/build M=$(PWD)/kernel clean - -FORCE : diff --git a/fw/t/sync_sockets/client.cc b/fw/t/sync_sockets/client.cc deleted file mode 100644 index 1ecdd61eb..000000000 --- a/fw/t/sync_sockets/client.cc +++ /dev/null @@ -1,121 +0,0 @@ -/** - * Multithreaded client for performance testing of Synchronous Socket API. - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). 
- * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -static const size_t THR_N = 16; -static const size_t CONNECTIONS = 64; // connections per thread -static const size_t MESSAGES = 4096; -static int msg[MSG_SZ]; -static unsigned short PORT = 5000; -static struct sockaddr_in saddr = {}; - -void -run_tcp_load() -{ - int sd[CONNECTIONS] = { 0 }; - - for (size_t i = 0; i < CONNECTIONS; ++i) { - sd[i] = socket(PF_INET, SOCK_STREAM, 0); - if (sd[i] < 0) { - sd[i] = 0; - std::cerr << "can't create socket #" << i << std::endl; - } - - // Send segments as soon as possible. 
- const int o = 1; - if (setsockopt(sd[i], IPPROTO_TCP, TCP_NODELAY, &o, sizeof(o))) - std::cerr << "can't set TCP_NODELAY on socket #" << i - << std::endl; - - if (connect(sd[i], (struct sockaddr *)&saddr, sizeof(saddr))) { - std::cerr << "can't connect on socket #" << i - << std::endl; - close(sd[i]); - sd[i] = 0; - } - } - - for (size_t i = 0; i < CONNECTIONS; ++i) { - if (!sd[i]) - continue; - for (size_t m = 0; m < MESSAGES; ++m) - if (send(sd[i], msg, sizeof(msg), 0) != sizeof(msg)) { - std::cerr << "can't send on socket #" - << i << std::endl; - close(sd[i]); - sd[i] = 0; - } - } - - for (size_t i = 0; i < CONNECTIONS; ++i) - if (sd[i]) - close(sd[i]); -} - -int -main(int argc, char *argv[]) -{ - if (argc < 2) { - std::cerr << "Please specify server address" << std::endl; - return 1; - } - - struct rlimit rlim; - if (getrlimit(RLIMIT_NOFILE, &rlim)) { - std::cerr << "getrlimit() failed" << std::endl; - } else { - if (rlim.rlim_cur < THR_N * CONNECTIONS - || rlim.rlim_max < THR_N * CONNECTIONS) - { - std::cerr << "please adjust limit of open files to " - << THR_N * CONNECTIONS << std::endl; - return 2; - } - } - - saddr.sin_family = AF_INET; - saddr.sin_port = htons(PORT); - if (inet_pton(AF_INET, argv[1], &saddr.sin_addr.s_addr) <= 0) { - std::cerr << "Bad address: " << argv[1] << std::endl; - return 3; - } - - // Initialize message - the same for all transmission. - for (size_t i = 0; i < MSG_SZ; ++i) - msg[i] = i; - - std::thread thr[THR_N]; - for (size_t i = 0; i < THR_N; ++i) - thr[i] = std::thread{ run_tcp_load }; - for (size_t i = 0; i < THR_N; ++i) - thr[i].join(); - - return 0; -} diff --git a/fw/t/sync_sockets/kernel/Makefile b/fw/t/sync_sockets/kernel/Makefile deleted file mode 100644 index a5013e0e1..000000000 --- a/fw/t/sync_sockets/kernel/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -EXTRA_CFLAGS = $(DEFINES) -I$(src)/../../.. 
- -obj-m = kserver.o ss_kserver.o ss_kclient.o - -ss_kserver-objs = \ - sync_kserver.o \ - ../../../sock.o - -ss_kclient-objs = \ - sync_kclient.o \ - ../../../addr.o - -all: - $(MAKE) -C /lib/modules/$(KVERSION)/build M=$(PWD) modules - -clean: - $(MAKE) -C /lib/modules/$(KVERSION)/build M=$(PWD) clean diff --git a/fw/t/sync_sockets/kernel/kserver.c b/fw/t/sync_sockets/kernel/kserver.c deleted file mode 100644 index 8dab1f501..000000000 --- a/fw/t/sync_sockets/kernel/kserver.c +++ /dev/null @@ -1,365 +0,0 @@ -/** - * Multiplexing kernel server for performance testing of Synchronous Socket API. - * - * The code is mostly inspired by Oracle RDS (linux/net/rds). - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2018 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_CONN (1000 * 1000) -#define PORT 5000 -#define READ_SZ (MSG_SZ * sizeof(int)) - -typedef struct { - struct work_struct work; - struct socket *sk; -} SocketWork; - -static void kserver_accept_worker(struct work_struct *); -static void kserver_read_worker(struct work_struct *); - -static struct socket *listen_sock; -static struct workqueue_struct *kserver_wq; -static struct kmem_cache *sw_cache; - -/* Statistics */ -static long last_ts = 0; -static unsigned int pps_curr = 0, pps_max = 0; -static DEFINE_SPINLOCK(stat_lock); - -static int stop = 0; -static atomic_t works = ATOMIC_INIT(0); /* number of works in progress */ -static int msg_buf[MSG_SZ]; -static int g_counter; - -static atomic_t conn_i = ATOMIC_INIT(0); -static struct socket *conn[MAX_CONN] = { NULL }; - -MODULE_LICENSE("GPL"); - -static void -stat_update(int events) -{ - spin_lock(&stat_lock); - if (last_ts == jiffies / HZ) { - pps_curr += events; - } else { - // recharge - if (pps_curr > pps_max) - pps_max = pps_curr; - pps_curr = events; - last_ts = jiffies / HZ; - } - spin_unlock(&stat_lock); -} - -void -stat_print(void) -{ - printk(KERN_ERR "Best rps: %lu\n", - (pps_curr > pps_max ? pps_curr : pps_max) / READ_SZ); -} - -static void -kserver_do_socket_read(struct socket *sock) -{ - int r, count = 0; - do { - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL - }; - struct kvec iov = { msg_buf, READ_SZ }; - - r = kernel_recvmsg(sock, &msg, &iov, 1, READ_SZ, - msg.msg_flags); - if (r >= 0) { - // Just do some useless work. 
- int i; - for (i = 0; i < r / 4; ++i) - g_counter += msg_buf[i]; - count += r; - } else if (r != -EAGAIN) - printk(KERN_ERR "error (%d) on socket %p\n", r, sock); - } while (r > 0); - - stat_update(count); -} - -static void -kserver_read_worker(struct work_struct *work) -{ - SocketWork *sw = (SocketWork *)work; - - BUG_ON(!sw->sk); - - kserver_do_socket_read(sw->sk); - - kmem_cache_free(sw_cache, sw); - atomic_dec(&works); -} - -static void -kserver_read_data_ready(struct sock *sk) -{ - SocketWork *sw; - - atomic_inc(&works); - if (stop) { - atomic_dec(&works); - goto out; - } - - sw = kmem_cache_alloc(sw_cache, GFP_ATOMIC); - if (!sw) { - printk(KERN_ERR "Can't allocate read work\n"); - atomic_dec(&works); - goto out; - } - INIT_WORK(&sw->work, kserver_read_worker); - sw->sk = sk->sk_socket; - - BUG_ON(!sk->sk_socket->ops); - - read_lock(&sk->sk_callback_lock); - - queue_work(kserver_wq, &sw->work); - - read_unlock(&sk->sk_callback_lock); - -out: - return; -} - -static void -kserver_state_change(struct sock *sk) -{ - read_lock(&sk->sk_callback_lock); - - switch (sk->sk_state) { - case TCP_CLOSE: - stat_update(READ_SZ); - default: - break; - } - - read_unlock(&sk->sk_callback_lock); -} - -static int -kserver_accept(struct socket *sock) -{ - struct socket *new_sock = NULL; - int ci, r = 1; - - if (stop) - goto out; - - r = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, - sock->sk->sk_protocol, &new_sock); - if (r) - goto out; - - new_sock->type = sock->type; - new_sock->ops = sock->ops; - r = sock->ops->accept(sock, new_sock, O_NONBLOCK); - if (r < 0) - goto err; - - write_lock_bh(&new_sock->sk->sk_callback_lock); - new_sock->sk->sk_state_change = kserver_state_change; - write_unlock_bh(&new_sock->sk->sk_callback_lock); - - /* Write the socket to free it as module exit. 
*/ - ci = atomic_inc_return(&conn_i); - if (ci < MAX_CONN) { - conn[ci] = new_sock; - } else { - printk(KERN_ERR "Too many connections!\n"); - } - - /* Check whether the socket has some data to read. */ - kserver_do_socket_read(new_sock); - - return 0; -err: - if (new_sock) - sock_release(new_sock); -out: - return r; -} - -static void -kserver_accept_worker(struct work_struct *work) -{ - SocketWork *sw = (SocketWork *)work; - - BUG_ON(sw->sk != listen_sock); - - while (!kserver_accept(sw->sk)) { - stat_update(READ_SZ); - cond_resched(); - } - - kmem_cache_free(sw_cache, sw); - atomic_dec(&works); -} - -static void -kserver_listen_data_ready(struct sock *sk) -{ - SocketWork *sw; - - atomic_inc(&works); - - sw = kmem_cache_alloc(sw_cache, GFP_ATOMIC); - if (!sw) { - printk(KERN_ERR "Can't allocate accept work\n"); - atomic_dec(&works); - return; - } - INIT_WORK(&sw->work, kserver_accept_worker); - sw->sk = listen_sock; - - read_lock(&sk->sk_callback_lock); - - if (sk->sk_state == TCP_LISTEN) { - queue_work(kserver_wq, &sw->work); - } else { - kmem_cache_free(sw_cache, sw); - atomic_dec(&works); - } - - read_unlock(&sk->sk_callback_lock); -} - -static void -kserver_data_ready(struct sock *sk, int bytes __attribute__((unused))) -{ - if (!sk->sk_socket) - /* - * Just established, not fully initialized, socket. - * Now we can't read from it, but we'll drain its receive - * queue just when it's fully initialized in kserver_accept(). - */ - return; - - if (sk->sk_socket == listen_sock) - kserver_listen_data_ready(sk); - else - /* - * We process child socket data in parent callback - * to avoid absence of proper callback on data arriving - * due to registration of callback after accepting the socket. 
- */ - kserver_read_data_ready(sk); -} - -int __init -kserver_init(void) -{ - int r = -ENOMEM; - struct sockaddr_in saddr; - - sw_cache = kmem_cache_create("kserver_work_cache", sizeof(SocketWork), - 0, 0, NULL); - if (!sw_cache) { - printk(KERN_ERR "Can't create read work cache\n"); - return r; - } - - kserver_wq = create_singlethread_workqueue("kserverd"); - if (!kserver_wq) { - printk(KERN_ERR "Can't create workqueue\n"); - goto err; - } - - r = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &listen_sock); - if (r) { - printk(KERN_ERR "Can't listening socket\n"); - goto err_sock; - } - - inet_sk(listen_sock->sk)->freebind = 1; - listen_sock->sk->sk_reuse = 1; - - write_lock_bh(&listen_sock->sk->sk_callback_lock); - listen_sock->sk->sk_data_ready = kserver_data_ready; - write_unlock_bh(&listen_sock->sk->sk_callback_lock); - - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = htonl(INADDR_ANY); - saddr.sin_port = htons(PORT); - - r = listen_sock->ops->bind(listen_sock, (struct sockaddr *)&saddr, - sizeof(saddr)); - if (r) { - printk(KERN_ERR "Can't bind listening socket\n"); - goto err_call; - } - - r = listen_sock->ops->listen(listen_sock, 1000); - if (r) { - printk(KERN_ERR "Can't listen on socket\n"); - goto err_call; - } - - return 0; -err_call: - sock_release(listen_sock); -err_sock: - destroy_workqueue(kserver_wq); -err: - kmem_cache_destroy(sw_cache); - return r; -} - -void __exit -kserver_exit(void) -{ - int ci; - - stop = 1; - - sock_release(listen_sock); - for (ci = 0; ci < atomic_read(&conn_i); ++ci) - if (conn[ci]) - sock_release(conn[ci]); - - while (atomic_read(&works)) - schedule(); - - stat_print(); - - destroy_workqueue(kserver_wq); - kmem_cache_destroy(sw_cache); -} - -module_init(kserver_init); -module_exit(kserver_exit); diff --git a/fw/t/sync_sockets/kernel/sync_kclient.c b/fw/t/sync_sockets/kernel/sync_kclient.c deleted file mode 100644 index c0c154725..000000000 --- 
a/fw/t/sync_sockets/kernel/sync_kclient.c +++ /dev/null @@ -1,338 +0,0 @@ -/* - * A client for testing Synchronous Sockets connect() that does not sleep. - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2018 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "addr.h" -#include "log.h" -#include "sync_socket.h" - -/* - * Start KCLIENT_NTHREADS threads. Each thread initiates KCLIENT_NCONNECTS - * connects to a remote server, and finishes. Errors in this process are - * marked with a KCLIENT_CONNECT_ERROR flag, which may be used for extra - * reporting. Successful connect attempts are counted. - * - * Another thread is started that waits on these connect attempts to finish. - * - * As each connect attempt finishes after 3WHS, an SS hook is invoked that - * marks the connection as established. - * - * The waiting thread waits on all successful connects attempts to complete. - * After that, it closes all open connections. - * - * In the end, when the module is unloaded, a stat report is printed. 
- */ -#ifdef SS_BANNER -#undef SS_BANNER -#endif -#define SS_BANNER "[kclient] " - -#define KCLIENT_NTHREADS (16) -#define KCLIENT_NCONNECTS (64) -#define KCLIENT_WAIT_INTVL (2) /* in seconds */ -#define KCLIENT_WAIT_MAX (1 * 60) /* in seconds */ - -/* Flags for kclient_desc_t.flags */ -#define KCLIENT_CONNECT_STARTED (0x0001) -#define KCLIENT_CONNECT_ESTABLISHED (0x0002) -#define KCLIENT_CONNECT_CLOSED (0x0004) -#define KCLIENT_CONNECT_ERROR (0x0100) - -typedef struct kclient_desc { - SsProto proto; - struct sock *sk; - uint32_t flags; -} kclient_desc_t; - -/* - * There's a descriptor for each connection that keeps the connection's - * state and status. All descriptors are kept in a static two-dimensional - * array. SsProto.type field is used here to store the index into that - * array that can be passed around between callbacks. - */ -static kclient_desc_t kclient_desc[KCLIENT_NTHREADS][KCLIENT_NCONNECTS]; -static struct task_struct *kclient_connect_task[KCLIENT_NTHREADS]; -DECLARE_WAIT_QUEUE_HEAD(kclient_connect_wq); -static atomic_t kclient_nthreads; - -static struct task_struct *kclient_finish_task; -DECLARE_WAIT_QUEUE_HEAD(kclient_finish_wq); - -static atomic_t kclient_connect_nattempt; /* Successful attempts */ -static atomic_t kclient_connect_ncomplete; /* Connections established */ -static atomic_t kclient_connect_nclose; /* Connections closed */ -static atomic_t kclient_connect_nerror; /* Number of errors */ - -static char *server = "127.0.0.1:5000"; -static TfwAddr kclient_server_address; -static SsHooks kclient_hooks; - -module_param(server, charp, 0); -MODULE_PARM_DESC(server, "Server host address and optional port number"); -MODULE_LICENSE("GPL"); - -static int -kclient_connect(int descidx) -{ - int ret; - struct sock *sk; - kclient_desc_t *desc = *(kclient_desc + descidx / KCLIENT_NCONNECTS) - + descidx % KCLIENT_NCONNECTS; - - ret = ss_sock_create(kclient_server_address.sa.sa_family, - SOCK_STREAM, IPPROTO_TCP, &sk); - if (ret) { - SS_DBG("Unable to 
create kernel socket (%d)\n", ret); - desc->flags |= KCLIENT_CONNECT_ERROR; - atomic_inc(&kclient_connect_nerror); - return ret; - } - ss_proto_init(&desc->proto, &kclient_hooks, descidx); - sk->sk_user_data = &desc->proto; - ss_set_callbacks(sk); - ret = ss_connect(sk, &kclient_server_address.sa, - tfw_addr_sa_len(&kclient_server_address), 0); - if (ret) { - SS_DBG("Connect error on server socket sk %p (%d)\n", sk, ret); - ss_release(sk); - desc->flags |= KCLIENT_CONNECT_ERROR; - atomic_inc(&kclient_connect_nerror); - return ret; - } - desc->sk = sk; - desc->flags |= KCLIENT_CONNECT_STARTED; - atomic_inc(&kclient_connect_nattempt); - return 0; -} - -static int -kclient_connect_complete(struct sock *sk) -{ - int descidx; - kclient_desc_t *desc; - SsProto *proto = sk->sk_user_data; - - BUG_ON(proto == NULL); - - descidx = proto->type; - desc = *(kclient_desc + descidx / KCLIENT_NCONNECTS) - + descidx % KCLIENT_NCONNECTS; - BUG_ON(desc->proto.type != descidx); - BUG_ON(desc->proto.listener != NULL); - BUG_ON(desc->proto.hooks != &kclient_hooks); - BUG_ON(desc->sk && (desc->sk != sk)); - - desc->flags |= KCLIENT_CONNECT_ESTABLISHED; - atomic_inc(&kclient_connect_ncomplete); - wake_up(&kclient_finish_wq); - return 0; -} - -static int -kclient_connection_close(struct sock *sk) -{ - int descidx; - kclient_desc_t *desc; - SsProto *proto = sk->sk_user_data; - - BUG_ON(proto == NULL); - - descidx = proto->type; - desc = *(kclient_desc + descidx / KCLIENT_NCONNECTS) - + descidx % KCLIENT_NCONNECTS; - BUG_ON(desc->proto.type != descidx); - BUG_ON(desc->proto.listener != NULL); - BUG_ON(desc->proto.hooks != &kclient_hooks); - BUG_ON(desc->sk && (desc->sk != sk)); - - desc->sk = NULL; - desc->flags |= KCLIENT_CONNECT_CLOSED; - atomic_inc(&kclient_connect_nclose); - wake_up(&kclient_finish_wq); - return 0; -} - -static SsHooks kclient_hooks = { - .connection_new = kclient_connect_complete, - .connection_drop = kclient_connection_close, -}; - -static void -kclient_report(void) -{ 
- SS_ERR("Initiated %d connects\n", - KCLIENT_NTHREADS * KCLIENT_NCONNECTS); - SS_ERR("Of those %d connects initiated successfully\n", - atomic_read(&kclient_connect_nattempt)); - SS_ERR("Of those %d connections were established successfully\n", - atomic_read(&kclient_connect_ncomplete)); - SS_ERR("and %d connections completed with error\n", - atomic_read(&kclient_connect_nerror)); -} - -static void -kclient_release_sockets(void) -{ - int i, k; - - for (i = 0; i < KCLIENT_NTHREADS; i++) { - for (k = 0; k < KCLIENT_NCONNECTS; k++) { - if (kclient_desc[i][k].sk) { - ss_release(kclient_desc[i][k].sk); - kclient_desc[i][k].sk = NULL; - } - } - } -} - -static int -kclient_thread_finish(void *data) -{ - int nattempt = atomic_read(&kclient_connect_nattempt); - uint64_t time_max = (uint64_t)get_seconds() + KCLIENT_WAIT_MAX; - - set_freezable(); - do { - long timeout = KCLIENT_WAIT_INTVL; - int nerror = atomic_read(&kclient_connect_nerror); - int ncomplete = atomic_read(&kclient_connect_ncomplete); - - if (ncomplete + nerror == nattempt) { - break; - } - wait_event_freezable_timeout(kclient_finish_wq, - kthread_should_stop(), - timeout); - if ((uint64_t)get_seconds() > time_max) { - SS_ERR("%s exceeded maximum wait time of %d seconds\n", - "kclient_thread_finish", KCLIENT_WAIT_MAX); - break; - } - } while (!kthread_should_stop()); - - kclient_release_sockets(); - kclient_finish_task = NULL; - return 0; -} - -static int -kclient_thread_connect(void *data) -{ - int i, nconnects = 0; - int threadn = (int)(long)data; - int descidx = threadn * KCLIENT_NCONNECTS; - - SS_DBG("connect_thread_%02d started\n", threadn); - for (i = 0; i < KCLIENT_NCONNECTS; i++) { - if (kclient_connect(descidx + i) == 0) { - nconnects++; - } - } - kclient_connect_task[threadn] = NULL; - atomic_dec(&kclient_nthreads); - wake_up(&kclient_connect_wq); - SS_DBG("Thread %d has initiated %d connects out of %d\n", - threadn, nconnects, KCLIENT_NCONNECTS); - return 0; -} - -static void 
-kclient_stop_threads(void) -{ - int i; - - for (i = 0; i < KCLIENT_NTHREADS; i++) { - if (kclient_connect_task[i]) { - kthread_stop(kclient_connect_task[i]); - kclient_connect_task[i] = NULL; - } - } - if (kclient_finish_task) { - kthread_stop(kclient_finish_task); - kclient_finish_task = NULL; - } - kclient_release_sockets(); -} - -static int __init -kclient_init(void) -{ - int i, ret = 0; - struct task_struct *task; - - if (tfw_addr_pton(server, &kclient_server_address)) { - SS_ERR("Unable to parse server's address: %s", server); - return -EINVAL; - } - SS_ERR("Started kclient module, server's address is %s\n", server); - - task = kthread_create(kclient_thread_finish, 0, - "kclient_thread_finish"); - if (IS_ERR_OR_NULL(task)) { - ret = PTR_ERR(task); - SS_ERR("Unable to create thread: %s (%d)\n", - "kclient_finish_task", ret); - return ret; - } - kclient_finish_task = task; - - for (i = 0; i < KCLIENT_NTHREADS; i++) { - task = kthread_create(kclient_thread_connect, (void *)(long)i, - "kclient_thread_connect_%02d", i); - if (IS_ERR_OR_NULL(task)) { - ret = PTR_ERR(task); - SS_ERR("Unable to create a thread: %s%02d (%d)\n", - "kclient_thread_connect", i, ret); - break; - } - kclient_connect_task[i] = task; - } - if (ret) { - kclient_stop_threads(); - } else { - atomic_set(&kclient_nthreads, KCLIENT_NTHREADS); - for (i = 0; i < KCLIENT_NTHREADS; i++) { - wake_up_process(kclient_connect_task[i]); - } - SS_ERR("Started %d threads to initiate %d connects each\n", - KCLIENT_NTHREADS, KCLIENT_NCONNECTS); - wait_event_interruptible(kclient_connect_wq, - atomic_read(&kclient_nthreads) == 0); - wake_up_process(kclient_finish_task); - } - return ret; -} - -static void -kclient_exit(void) -{ - kclient_stop_threads(); - kclient_report(); -} - -module_init(kclient_init); -module_exit(kclient_exit); diff --git a/fw/t/sync_sockets/kernel/sync_kserver.c b/fw/t/sync_sockets/kernel/sync_kserver.c deleted file mode 100644 index a5dcffaa5..000000000 --- 
a/fw/t/sync_sockets/kernel/sync_kserver.c +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Multiplexing kernel server using synchronous sockets for performance testing - * of Synchronous Socket API. - * - * It works fully in softirq context as opposed to kserver working mostly in - * kworker threads. - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2015-2022 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include -#include -#include -#include -#include - -#include "sync_socket.h" - -#define MAX_CONN (1000 * 1000) -#define PORT 5000 -#define READ_SZ (MSG_SZ * sizeof(int)) - -/* Application logic class inherited from SsProto. */ -typedef struct { - SsProto proto; -} MyProto; - -static MyProto my_proto; - -/* Statistics */ -static long last_ts = 0; -static unsigned int pps_curr = 0, pps_max = 0; - -static int g_counter; - -static atomic_t conn_i = ATOMIC_INIT(0); -static struct sock *conn[MAX_CONN] = { NULL }; - -MODULE_LICENSE("GPL"); - -static void -stat_update(int events) -{ - /* Only one softirq context, so no synchronization is needed. 
*/ - if (last_ts == jiffies / HZ) { - pps_curr += events; - } else { - // recharge - if (pps_curr > pps_max) - pps_max = pps_curr; - pps_curr = events; - last_ts = jiffies / HZ; - } -} - -void -stat_print(void) -{ - printk(KERN_ERR "Best rps: %lu\n", - (pps_curr > pps_max ? pps_curr : pps_max) / READ_SZ); -} - -/* - * Just do some useless work. - */ -static int -kserver_read(struct sock *sk, unsigned char *data, size_t len) -{ - int i; - for (i = 0; i < len / 4; ++i) - g_counter += data[i]; - - stat_update(len); - - return 0; -} - -static int -kserver_connection_new(struct sock *sk) -{ - int ci; - - BUG_ON(!sk->sk_user_data); - - /* TODO Typically we should allocate a new connection here. */ - - /* Write the socket to free it as module exit. */ - ci = atomic_inc_return(&conn_i); - if (ci < MAX_CONN) { - conn[ci] = sk; - } else { - printk(KERN_ERR "Too many connections!\n"); - } - - stat_update(READ_SZ); - - return 0; -} - -static int -kserver_connection_drop(struct sock *sk) -{ - stat_update(READ_SZ); - - return 0; -} - -static SsHooks ssocket_hooks = { - .connection_new = kserver_connection_new, - .connection_drop = kserver_connection_drop, - .connection_recv = kserver_read, -}; - -int __init -kserver_init(void) -{ - int r; - struct sock *lsk; - struct sockaddr_in saddr; - - r = ss_sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &lsk); - if (r) { - printk(KERN_ERR "Can't listening socket\n"); - goto err_create; - } - - inet_sk(lsk)->freebind = 1; - lsk->sk_reuse = 1; - - /* Set TCP handlers. 
*/ - ss_proto_init((SsProto *)&my_proto, &ssocket_hooks, 0); - lsk->sk_user_data = (SsProto *)&my_proto; - ss_set_listen(lsk); - - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = htonl(INADDR_ANY); - saddr.sin_port = htons(PORT); - - r = ss_bind(lsk, (struct sockaddr *)&saddr, sizeof(saddr)); - if (r) { - printk(KERN_ERR "Can't bind listening socket\n"); - goto err_call; - } - - r = ss_listen(lsk, 1000); - if (r) { - printk(KERN_ERR "Can't listen on socket\n"); - goto err_call; - } - - return 0; -err_call: - ss_release(lsk); -err_create: - return r; -} - -void __exit -kserver_exit(void) -{ - int ci; - - ss_release(my_proto.proto.listener); - - for (ci = 0; ci < atomic_read(&conn_i); ++ci) - if (conn[ci]) - ss_close_sync(conn[ci], true); - - /* - * TODO at this point the module can crash if there is some active - * softirq processing the sockets which are calling ssocket_hooks - * callbacks. - */ - - stat_print(); -} - -module_init(kserver_init); -module_exit(kserver_exit); diff --git a/fw/t/sync_sockets/server.cc b/fw/t/sync_sockets/server.cc deleted file mode 100644 index 55971a913..000000000 --- a/fw/t/sync_sockets/server.cc +++ /dev/null @@ -1,283 +0,0 @@ -/** - * Multiplexing user-space server for performance testing of - * Synchronous Socket API. - * - * Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). - * Copyright (C) 2018 Tempesta Technologies, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, - * or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static const size_t MAX_CONNECTIONS = 1000 * 1000; -static const int READ_SZ = MSG_SZ * sizeof(int); - -/** - * Counts number of requests per second and prints the best one. - */ -class RequestsStatistics { -public: - RequestsStatistics() - : last_ts_(time(NULL)), - curr_(0), - max_(0) - {} - - void - update(int events) - { - time_t t = time(NULL); - if (last_ts_ == t) { - curr_ += events; - } else { - // recharge - if (curr_ > max_) - max_ = curr_; - curr_ = events; - last_ts_ = t; - } - } - - void - print() - { - std::cout << "Best rps: " << (std::max(max_, curr_) / READ_SZ) - << std::endl; - } - -private: - time_t last_ts_; - unsigned int curr_; - unsigned int max_; -}; - -static unsigned short PORT = 5000; -static int msg[MSG_SZ]; -static unsigned int g_counter = 0; -RequestsStatistics stat; - -void -sig_handler(int sig_num) -{ - std::cout << "received signal " << sig_num << std::endl; - exit(0); -} - -void -set_sig_handlers() -{ - struct sigaction sa; - - sigemptyset (&sa.sa_mask); - sigaddset(&sa.sa_mask, SIGHUP); - sigaddset(&sa.sa_mask, SIGINT); - sigaddset(&sa.sa_mask, SIGQUIT); - sigaddset(&sa.sa_mask, SIGPIPE); - sigaddset(&sa.sa_mask, SIGTERM); - sigaddset(&sa.sa_mask, SIGUSR1); - sigaddset(&sa.sa_mask, SIGUSR2); - - sa.sa_handler = sig_handler; - sa.sa_flags = SA_RESTART; - - sigaction(SIGHUP, &sa, NULL); - sigaction(SIGINT, &sa, NULL); - sigaction(SIGQUIT, &sa, NULL); - sigaction(SIGPIPE, &sa, NULL); - sigaction(SIGTERM, &sa, NULL); - sigaction(SIGUSR1, &sa, NULL); - sigaction(SIGUSR2, &sa, NULL); -} - -void -print_statistics() -{ - stat.print(); -} - -int -sd_add_to_epoll(int efd, 
int sd) -{ - epoll_event ev = {}; - ev.events = EPOLLIN; - ev.data.fd = sd; - if (epoll_ctl(efd, EPOLL_CTL_ADD, sd, &ev) < 0) { - std::cerr << "can't add socket " << sd << " to epoll" - << std::endl; - return 1; - } - return 0; -} - -void -set_nonblock(int sd, const char *desc) -{ - int flags = fcntl(sd, F_GETFL, 0); - if (!(flags & O_NONBLOCK)) - if(fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) { - std::cerr << "can't make " << desc - << " socket nonblocking" << std::endl; - exit(1); - } -} - -void -work_loop(int listen_sd, int wd) -{ - struct epoll_event ev[64]; - int n = epoll_wait(wd, ev, 64, -1); - if (n < 1) { - std::cerr << "epoll wait failed" << std::endl; - exit(1); - } - - for (int i = 0; i < n; ++i) { - if (ev[i].data.fd == listen_sd) { - // Process new connection. - while (1) { - int sd = accept(listen_sd, NULL, NULL); - if (sd < 1) { - if (errno == EAGAIN) - break; - std::cerr << "can't accept a socket" - << std::endl; - exit(1); - } - - set_nonblock(sd, "work"); - - if (sd_add_to_epoll(wd, sd)) - exit(1); - - stat.update(READ_SZ); - } - } - else { - // Process data on established connections. - assert(ev[i].events & EPOLLIN); - - int count = 0, r; - do { - r = recv(ev[i].data.fd, msg, READ_SZ, 0); - if (!r) { - epoll_ctl(wd, EPOLL_CTL_DEL, - ev[i].data.fd, NULL); - close(ev[i].data.fd); - count = READ_SZ; - } - else if (r < 0 && errno != EAGAIN) { - std::cerr << "failed to read on" - << " socket " << ev[i].data.fd - << " (ret=" << r << ")" - << std::endl; - exit(1); - } - - // Just do some useless work. 
- if (r > 0) { - for (int j = 0; j < r / 4; ++j) - g_counter += msg[j]; - count += r; - } - } while (r > 0); - - stat.update(count); - } - } -} - -int -main(int argc, char *argv[]) -{ - struct rlimit rlim; - if (getrlimit(RLIMIT_NOFILE, &rlim)) { - std::cerr << "getrlimit() failed" << std::endl; - } else { - if (rlim.rlim_cur < MAX_CONNECTIONS - || rlim.rlim_max < MAX_CONNECTIONS) - { - std::cerr << "please adjust limit of open files to " - << MAX_CONNECTIONS << std::endl; - exit(1); - } - } - - set_sig_handlers(); - atexit(print_statistics); - - int listen_sd = socket(PF_INET, SOCK_STREAM, 0); - if (listen_sd < 0) { - std::cerr << "can't create listening socket" << std::endl; - exit(1); - } - - static const int on = 1; - if (setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, (char *)&on, - sizeof(on)) < 0) - { - std::cerr << "can't set reuseaddr for listening socket" - << std::endl; - exit(1); - } - - struct sockaddr_in saddr = {}; - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = INADDR_ANY; - saddr.sin_port = htons(PORT); - if (bind(listen_sd, (const sockaddr *)&saddr, sizeof(saddr))) { - std::cerr << "can't bind listening socket " << std::endl; - exit(1); - } - - /* - * XXX set /proc/sys/net/core/somaxconn also to 1000. - * See listen(2). - */ - if (listen(listen_sd, 1000)) { - std::cerr << "can't listen on socket" << std::endl; - exit(1); - } - - set_nonblock(listen_sd, "listen"); - - int wd = epoll_create(1000); - if (wd < 0) { - std::cerr << "can't create epoll" << std::endl; - exit(1); - } - - if (sd_add_to_epoll(wd, listen_sd)) - exit(1); - - while (1) - work_loop(listen_sd, wd); - - return 0; -} From 4cfda82d1b48625e1f95b6b883bea7581fb71d7d Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sun, 4 Sep 2022 20:51:16 +0300 Subject: [PATCH 25/26] Multiple fixes: 1. all our sockets have sk->sk_allocation == GFP_ATOMIC, so remove setting of the allocation class and use sk_uid to differentiate client and server sockets in ss_tcp_state_change(). 2. 
ss_tcp_state_change() uses SsHooks from SsProto, which is supposed to be in sk->sk_user_data to call connection_new hooks, so we do need to keep full SsProto in sk_user_data for listening sockets. I used statically allocated SsProtos to make child sockets independent from the parent and so avoid the race that the previous commit attempted to fix. 3. remove ss_wait_newconn() call from tfw_sock_clnt_start() - this was just garbage left over from the invalid attempt to fix the race in the previous commit. 4. remove t/sync_sockets from the Makefile --- fw/sock.c | 15 +++-------- fw/sock_clnt.c | 66 +++++++++++++++++++++++------------------------- fw/sock_srv.c | 1 + fw/sync_socket.h | 6 +++-- fw/t/Makefile | 4 +-- 5 files changed, 42 insertions(+), 50 deletions(-) diff --git a/fw/sock.c b/fw/sock.c index b5aac1637..3762af9ef 100644 --- a/fw/sock.c +++ b/fw/sock.c @@ -967,15 +967,11 @@ ss_tcp_state_change(struct sock *sk) ss_sk_incoming_cpu_update(sk); assert_spin_locked(&sk->sk_lock.slock); TFW_VALIDATE_SK_LOCK_OWNER(sk); + WARN_ON(sk->sk_allocation != GFP_ATOMIC); if (sk->sk_state == TCP_ESTABLISHED) { - /* - * Process the new TCP connection. - * The kernel sets sk_allocation to GFP_KERNEL, so this way we - * cad differentiate server sockets, created by as, and client - * sockets created by the kernel. - */ - bool is_srv_sock = (sk->sk_allocation == GFP_ATOMIC); + /* Process the new TCP connection. */ + bool is_srv_sock = (sk->sk_uid.val == SS_SRV_USER); int r; /* @@ -1028,11 +1024,6 @@ ss_tcp_state_change(struct sock *sk) } sock_set_flag(sk, SOCK_TEMPESTA); - /* - * Tempesta works with sockets in SoftIRQ context, so always use - * atomic allocations only. 
- */ - sk->sk_allocation = GFP_ATOMIC; ss_active_guard_exit(SS_V_ACT_NEWCONN); } else if (sk->sk_state == TCP_CLOSE_WAIT) { diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 0cc404bfe..6bff2e939 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -157,15 +157,14 @@ tfw_cli_conn_send(TfwCliConn *cli_conn, TfwMsg *msg) return r; } -static const SsHooks *tfw_sock_clnt_hooks(int type); - /** * This hook is called when a new client connection is established. */ static int tfw_sock_clnt_new(struct sock *sk) { - int r = -ENOMEM, type; + int r = -ENOMEM; + SsProto *proto; TfwClient *cli; TfwConn *conn; TfwAddr addr; @@ -179,7 +178,7 @@ tfw_sock_clnt_new(struct sock *sk) * from referencing TfwListenSock{} while a new TfwConn{} object * is not yet allocated/initialized. */ - type = (long)sk->sk_user_data; + proto = sk->sk_user_data; tfw_connection_unlink_from_sk(sk); ss_getpeername(sk, &addr); @@ -189,13 +188,13 @@ tfw_sock_clnt_new(struct sock *sk) return -ENOENT; } - conn = (TfwConn *)tfw_cli_conn_alloc(type); + conn = (TfwConn *)tfw_cli_conn_alloc(proto->type); if (!conn) { T_ERR("can't allocate a new client connection\n"); goto err_client; } - ss_proto_init(&conn->proto, tfw_sock_clnt_hooks(type), type); + ss_proto_init(&conn->proto, proto->hooks, proto->type); BUG_ON(!(conn->proto.type & Conn_Clnt)); conn->destructor = (void *)tfw_cli_conn_release; @@ -298,22 +297,30 @@ static const SsHooks tfw_sock_tls_clnt_ss_hooks = { .connection_recv = tfw_tls_connection_recv, }; -static const SsHooks * -tfw_sock_clnt_hooks(int type) +/* + * We call the same TLS hooks before generic HTTP processing + * for both the HTTP/1 and HTTP/2. 
+ */ +static const SsProto tfw_sock_listen_protos[] = { + { &tfw_sock_http_clnt_ss_hooks, TFW_FSM_HTTP}, + { &tfw_sock_http_clnt_ss_hooks, Conn_HttpClnt}, + + { &tfw_sock_tls_clnt_ss_hooks, TFW_FSM_HTTPS}, + { &tfw_sock_tls_clnt_ss_hooks, Conn_HttpsClnt}, + + { &tfw_sock_tls_clnt_ss_hooks, TFW_FSM_H2}, + { &tfw_sock_tls_clnt_ss_hooks, Conn_H2Clnt}, +}; + +static const SsProto * +tfw_sock_clnt_protos(int type) { - switch (type) { - case TFW_FSM_HTTP: - return &tfw_sock_http_clnt_ss_hooks; - case TFW_FSM_HTTPS: - case TFW_FSM_H2: - /* - * We call the same TLS hooks before generic HTTP processing - * for both the HTTP/1 and HTTP/2. - */ - return &tfw_sock_tls_clnt_ss_hooks; - default: - BUG(); - } + int i; + + for (i = 0; i < ARRAY_SIZE(tfw_sock_listen_protos); ++i) + if (tfw_sock_listen_protos[i].type == type) + return &tfw_sock_listen_protos[i]; + BUG(); } static int @@ -402,7 +409,7 @@ static int tfw_listen_sock_add(const TfwAddr *addr, int type) { TfwListenSock *ls; - const SsHooks *shooks = tfw_sock_clnt_hooks(type); + const SsHooks *shooks = tfw_sock_clnt_protos(type)->hooks; /* Is there such an address on the list already? */ list_for_each_entry(ls, &tfw_listen_socks_reconf, list) { @@ -475,8 +482,7 @@ tfw_listen_sock_start(TfwListenSock *ls) /* * Link the new socket and TfwListenSock. * - * sk_user_data for listening sockets is used as an inherited type for - * children sockets only, so we just store the socket type here. + * We use static SsProto's for sk_user_data for listening sockets. * This way initialization of passively open sockets doesn't depend * on the listening socket, which migh be closed during a new connection * establishing. @@ -485,7 +491,7 @@ tfw_listen_sock_start(TfwListenSock *ls) * an unlimited time. 
*/ ls->sk = sk; - sk->sk_user_data = (void *)(long)ls->proto.type; + sk->sk_user_data = (SsProto *)tfw_sock_clnt_protos(ls->proto.type); ss_set_listen(sk); @@ -725,16 +731,8 @@ tfw_sock_clnt_start(void) listen_socks_sz--; list_del(&ls->list); - if (ls->sk) { + if (ls->sk) ss_release(ls->sk); - /* - * There is at least one listener, which we need to close, so - * wait while all new connections finish before freeing the - * listeners. This prevents racing of the function with - * tfw_sock_clnt_new(). - */ - ss_wait_newconn(); - } kfree(ls); } diff --git a/fw/sock_srv.c b/fw/sock_srv.c index 3da2e8a8f..0d6f055cc 100644 --- a/fw/sock_srv.c +++ b/fw/sock_srv.c @@ -262,6 +262,7 @@ tfw_sock_srv_connect_try(TfwSrvConn *srv_conn) tfw_connection_link_from_sk((TfwConn *)srv_conn, sk); tfw_connection_link_to_sk((TfwConn *)srv_conn, sk); tfw_srv_conn_init_as_dead(srv_conn); + sk->sk_uid.val = SS_SRV_USER; ss_set_callbacks(sk); /* * Set connection destructor such that connection failover can diff --git a/fw/sync_socket.h b/fw/sync_socket.h index 3d2a1950c..25d682813 100644 --- a/fw/sync_socket.h +++ b/fw/sync_socket.h @@ -111,6 +111,9 @@ ss_proto_init(SsProto *proto, const SsHooks *hooks, int type) proto->type = type; } +/* Dummy user ID to differentiate server from client sockets. */ +#define SS_SRV_USER 0x11223344 + /* Synchronous operation required. */ #define SS_F_SYNC 0x01 /* Keep SKBs (use clones) on sending. */ @@ -149,7 +152,6 @@ void ss_get_stat(SsStat *stat); ? ((SsProto *)(sk)->sk_user_data)->hooks->f(__VA_ARGS__) \ : 0) -#define SS_CONN_TYPE(sk) \ - (((SsProto *)(sk)->sk_user_data)->type) +#define SS_CONN_TYPE(sk) (((SsProto *)(sk)->sk_user_data)->type) #endif /* __SS_SOCK_H__ */ diff --git a/fw/t/Makefile b/fw/t/Makefile index 45ea5f768..3b2248d8c 100644 --- a/fw/t/Makefile +++ b/fw/t/Makefile @@ -1,7 +1,7 @@ # Tempesta FW # # Copyright (C) 2014 NatSys Lab. (info@natsys-lab.com). -# Copyright (C) 2015-2018 Tempesta Technologies, INC. 
+# Copyright (C) 2015-2022 Tempesta Technologies, INC. # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ export TFW_CFLAGS EXTRA_CFLAGS += $(TFW_CFLAGS) -I$(src)/.. -I$(src)/../../ EXTRA_CFLAGS += $(TTLS_CFLAGS) -obj-m += sync_sockets/ unit/ +obj-m += unit/ obj-m += tfw_fuzzer.o tfw_fuzzer-objs = \ From 96ba4f570d681935824744d1cd2200f5bca7a19b Mon Sep 17 00:00:00 2001 From: Alexander K Date: Tue, 6 Sep 2022 16:12:58 +0300 Subject: [PATCH 26/26] Fix small typos and a cleanup --- fw/sock_clnt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c index 6bff2e939..65a3d2c4f 100644 --- a/fw/sock_clnt.c +++ b/fw/sock_clnt.c @@ -313,7 +313,7 @@ static const SsProto tfw_sock_listen_protos[] = { }; static const SsProto * -tfw_sock_clnt_protos(int type) +tfw_sock_clnt_proto(int type) { int i; @@ -409,7 +409,7 @@ static int tfw_listen_sock_add(const TfwAddr *addr, int type) { TfwListenSock *ls; - const SsHooks *shooks = tfw_sock_clnt_protos(type)->hooks; + const SsHooks *shooks = tfw_sock_clnt_proto(type)->hooks; /* Is there such an address on the list already? */ list_for_each_entry(ls, &tfw_listen_socks_reconf, list) { @@ -484,14 +484,14 @@ tfw_listen_sock_start(TfwListenSock *ls) * * We use static SsProto's for sk_user_data for listening sockets. * This way initialization of passively open sockets doesn't depend - * on the listening socket, which migh be closed during a new connection + * on the listening socket, which might be closed during a new connection * establishing. * - * When a listening socket is closed, the children sockets migh live for + * When a listening socket is closed, the children sockets might live for * an unlimited time. 
*/ ls->sk = sk; - sk->sk_user_data = (SsProto *)tfw_sock_clnt_protos(ls->proto.type); + sk->sk_user_data = (SsProto *)tfw_sock_clnt_proto(ls->proto.type); ss_set_listen(sk);