diff --git a/librdmacm/configure.ac b/librdmacm/configure.ac index 7581df43bf7..b8e9c54d8cd 100644 --- a/librdmacm/configure.ac +++ b/librdmacm/configure.ac @@ -1,7 +1,7 @@ dnl Process this file with autoconf to produce a configure script. AC_PREREQ([2.63]) -AC_INIT([librdmacm],[1.0.18],[linux-rdma@vger.kernel.org]) +AC_INIT([librdmacm],[1.0.19-1],[linux-rdma@vger.kernel.org]) AC_CONFIG_SRCDIR([src/cma.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) @@ -40,14 +40,10 @@ dnl Checks for libraries AC_CHECK_LIB(pthread, pthread_mutex_init, [], AC_MSG_ERROR([pthread_mutex_init() not found. librdmacm requires libpthread.])) if test "$disable_libcheck" != "yes"; then -AC_CHECK_LIB(ibverbs, ibv_get_device_list, [], - AC_MSG_ERROR([ibv_get_device_list() not found. librdmacm requires libibverbs.])) +AC_CHECK_LIB(ibverbs, ibv_cmd_open_xrcd, [], + AC_MSG_ERROR([ibv_cmd_open_xrcd() not found. librdmacm requires libibverbs 1.1.8 or later.])) fi -AC_CHECK_MEMBER(struct ibv_path_record.service_id, [], - AC_DEFINE(DEFINE_PATH_RECORD, 1, [adding path record definition]), - [#include ]) - dnl Check for gcc atomic intrinsics AC_MSG_CHECKING(compiler support for atomics) AC_LINK_IFELSE([AC_LANG_PROGRAM([[int i = 0;]], diff --git a/librdmacm/examples/cmtime.c b/librdmacm/examples/cmtime.c index 6761f2ccda3..ebc660ba56a 100644 --- a/librdmacm/examples/cmtime.c +++ b/librdmacm/examples/cmtime.c @@ -84,10 +84,27 @@ struct node { int retries; }; +struct list_head { + struct list_head *prev; + struct list_head *next; + struct rdma_cm_id *id; +}; + +struct work_list { + pthread_mutex_t lock; + pthread_cond_t cond; + struct list_head list; +}; + +#define INIT_LIST(x) ((x)->prev = (x)->next = (x)) + +static struct work_list req_work; +static struct work_list disc_work; static struct node *nodes; static struct timeval times[STEP_CNT][2]; static int connections = 100; -static int left[STEP_CNT]; +static volatile int started[STEP_CNT]; +static volatile int completed[STEP_CNT]; static 
struct ibv_qp_init_attr init_qp_attr; static struct rdma_conn_param conn_param; @@ -96,6 +113,59 @@ static struct rdma_conn_param conn_param; #define start_time(s) gettimeofday(&times[s][0], NULL) #define end_time(s) gettimeofday(&times[s][1], NULL) +static inline void __list_delete(struct list_head *list) +{ + struct list_head *prev, *next; + prev = list->prev; + next = list->next; + prev->next = next; + next->prev = prev; + INIT_LIST(list); +} + +static inline int __list_empty(struct work_list *list) +{ + return list->list.next == &list->list; +} + +static inline int list_empty(struct work_list *work_list) +{ + pthread_mutex_lock(&work_list->lock); + return work_list->list.next == &work_list->list; + pthread_mutex_unlock(&work_list->lock); +} + +static inline struct list_head *__list_remove_head(struct work_list *work_list) +{ + struct list_head *list_item; + + list_item = work_list->list.next; + __list_delete(list_item); + return list_item; +} + +static inline struct list_head *list_remove_head(struct work_list *work_list) +{ + struct list_head *list_item; + pthread_mutex_lock(&work_list->lock); + list_item = __list_remove_head(work_list); + pthread_mutex_unlock(&work_list->lock); + return list_item; +} + +static inline void list_add_tail(struct work_list *work_list, struct list_head *req) +{ + int empty; + pthread_mutex_lock(&work_list->lock); + empty = __list_empty(work_list); + req->prev = work_list->list.prev; + req->next = &work_list->list; + req->prev->next = work_list->list.prev = req; + pthread_mutex_unlock(&work_list->lock); + if (empty) + pthread_cond_signal(&work_list->cond); +} + static int zero_time(struct timeval *t) { return !(t->tv_sec || t->tv_usec); @@ -140,28 +210,28 @@ static void show_perf(void) static void addr_handler(struct node *n) { end_perf(n, STEP_RESOLVE_ADDR); - left[STEP_RESOLVE_ADDR]--; + completed[STEP_RESOLVE_ADDR]++; } static void route_handler(struct node *n) { end_perf(n, STEP_RESOLVE_ROUTE); - left[STEP_RESOLVE_ROUTE]--; + 
completed[STEP_RESOLVE_ROUTE]++; } static void conn_handler(struct node *n) { end_perf(n, STEP_CONNECT); - left[STEP_CONNECT]--; + completed[STEP_CONNECT]++; } static void disc_handler(struct node *n) { end_perf(n, STEP_DISCONNECT); - left[STEP_DISCONNECT]--; + completed[STEP_DISCONNECT]++; } -static int req_handler(struct rdma_cm_id *id) +static void __req_handler(struct rdma_cm_id *id) { int ret; @@ -176,17 +246,50 @@ static int req_handler(struct rdma_cm_id *id) perror("failure accepting"); goto err; } - return 0; + return; err: printf("failing connection request\n"); rdma_reject(id, NULL, 0); - return ret; + rdma_destroy_id(id); + return; +} + +static void *req_handler_thread(void *arg) +{ + struct list_head *work; + do { + pthread_mutex_lock(&req_work.lock); + if (__list_empty(&req_work)) + pthread_cond_wait(&req_work.cond, &req_work.lock); + work = __list_remove_head(&req_work); + pthread_mutex_unlock(&req_work.lock); + __req_handler(work->id); + free(work); + } while (1); + return NULL; +} + +static void *disc_handler_thread(void *arg) +{ + struct list_head *work; + do { + pthread_mutex_lock(&disc_work.lock); + if (__list_empty(&disc_work)) + pthread_cond_wait(&disc_work.cond, &disc_work.lock); + work = __list_remove_head(&disc_work); + pthread_mutex_unlock(&disc_work.lock); + rdma_disconnect(work->id); + rdma_destroy_id(work->id); + free(work); + } while (1); + return NULL; } static void cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct node *n = id->context; + struct list_head *request; switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: @@ -196,10 +299,15 @@ static void cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) route_handler(n); break; case RDMA_CM_EVENT_CONNECT_REQUEST: - if (req_handler(id)) { - rdma_ack_cm_event(event); + request = malloc(sizeof *request); + if (!request) { + perror("out of memory accepting connect request"); + rdma_reject(id, NULL, 0); rdma_destroy_id(id); - return; + } else { + 
INIT_LIST(request); + request->id = id; + list_add_tail(&req_work, request); } break; case RDMA_CM_EVENT_ESTABLISHED: @@ -235,12 +343,18 @@ static void cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_DISCONNECTED: if (!n) { - rdma_disconnect(id); - rdma_ack_cm_event(event); - rdma_destroy_id(id); - return; - } - disc_handler(n); + request = malloc(sizeof *request); + if (!request) { + perror("out of memory queueing disconnect request, handling synchronously"); + rdma_disconnect(id); + rdma_destroy_id(id); + } else { + INIT_LIST(request); + request->id = id; + list_add_tail(&disc_work, request); + } + } else + disc_handler(n); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: /* Cleanup will occur after test completes. */ @@ -296,29 +410,67 @@ static void cleanup_nodes(void) end_time(STEP_DESTROY); } -static int process_events(int *left) +static void *process_events(void *arg) { struct rdma_cm_event *event; int ret = 0; - while ((!left || *left) && !ret) { + while (!ret) { ret = rdma_get_cm_event(channel, &event); if (!ret) { cma_handler(event->id, event); } else { - perror("failure in rdma_get_cm_event in connect events"); + perror("failure in rdma_get_cm_event in process_server_events"); ret = errno; } } - - return ret; + return NULL; } static int run_server(void) { + pthread_t req_thread, disc_thread; struct rdma_cm_id *listen_id; int ret; + INIT_LIST(&req_work.list); + INIT_LIST(&disc_work.list); + ret = pthread_mutex_init(&req_work.lock, NULL); + if (ret) { + perror("initializing mutex for req work"); + return ret; + } + + ret = pthread_mutex_init(&disc_work.lock, NULL); + if (ret) { + perror("initializing mutex for disc work"); + return ret; + } + + ret = pthread_cond_init(&req_work.cond, NULL); + if (ret) { + perror("initializing cond for req work"); + return ret; + } + + ret = pthread_cond_init(&disc_work.cond, NULL); + if (ret) { + perror("initializing cond for disc work"); + return ret; + } + + ret = 
pthread_create(&req_thread, NULL, req_handler_thread, NULL); + if (ret) { + perror("failed to create req handler thread"); + return ret; + } + + ret = pthread_create(&disc_thread, NULL, disc_handler_thread, NULL); + if (ret) { + perror("failed to create disconnect handler thread"); + return ret; + } + ret = rdma_create_id(channel, &listen_id, NULL, hints.ai_port_space); if (ret) { perror("listen request failed"); @@ -351,6 +503,7 @@ static int run_server(void) static int run_client(void) { + pthread_t event_thread; int i, ret; ret = get_rdma_addr(src_addr, dst_addr, port, &hints, &rai); @@ -365,6 +518,12 @@ static int run_client(void) conn_param.private_data = rai->ai_connect; conn_param.private_data_len = rai->ai_connect_len; + ret = pthread_create(&event_thread, NULL, process_events, NULL); + if (ret) { + perror("failure creating event thread"); + return ret; + } + if (src_addr) { printf("binding source address\n"); start_time(STEP_BIND); @@ -395,11 +554,9 @@ static int run_client(void) nodes[i].error = 1; continue; } - left[STEP_RESOLVE_ADDR]++; + started[STEP_RESOLVE_ADDR]++; } - ret = process_events(&left[STEP_RESOLVE_ADDR]); - if (ret) - return ret; + while (started[STEP_RESOLVE_ADDR] != completed[STEP_RESOLVE_ADDR]) sched_yield(); end_time(STEP_RESOLVE_ADDR); printf("resolving route\n"); @@ -415,11 +572,9 @@ static int run_client(void) nodes[i].error = 1; continue; } - left[STEP_RESOLVE_ROUTE]++; + started[STEP_RESOLVE_ROUTE]++; } - ret = process_events(&left[STEP_RESOLVE_ROUTE]); - if (ret) - return ret; + while (started[STEP_RESOLVE_ROUTE] != completed[STEP_RESOLVE_ROUTE]) sched_yield(); end_time(STEP_RESOLVE_ROUTE); printf("creating qp\n"); @@ -450,11 +605,9 @@ static int run_client(void) nodes[i].error = 1; continue; } - left[STEP_CONNECT]++; + started[STEP_CONNECT]++; } - ret = process_events(&left[STEP_CONNECT]); - if (ret) - return ret; + while (started[STEP_CONNECT] != completed[STEP_CONNECT]) sched_yield(); end_time(STEP_CONNECT); 
printf("disconnecting\n"); @@ -464,11 +617,9 @@ static int run_client(void) continue; start_perf(&nodes[i], STEP_DISCONNECT); rdma_disconnect(nodes[i].id); - left[STEP_DISCONNECT]++; + started[STEP_DISCONNECT]++; } - ret = process_events(&left[STEP_DISCONNECT]); - if (ret) - return ret; + while (started[STEP_DISCONNECT] != completed[STEP_DISCONNECT]) sched_yield(); end_time(STEP_DISCONNECT); return ret; diff --git a/librdmacm/examples/rdma_client.c b/librdmacm/examples/rdma_client.c index 7a59d976ce8..f676b7023f0 100644 --- a/librdmacm/examples/rdma_client.c +++ b/librdmacm/examples/rdma_client.c @@ -39,7 +39,8 @@ static char *server = "127.0.0.1"; static char *port = "7471"; struct rdma_cm_id *id; -struct ibv_mr *mr; +struct ibv_mr *mr, *send_mr; +int send_flags; uint8_t send_msg[16]; uint8_t recv_msg[16]; @@ -54,8 +55,8 @@ static int run(void) hints.ai_port_space = RDMA_PS_TCP; ret = rdma_getaddrinfo(server, port, &hints, &res); if (ret) { - printf("rdma_getaddrinfo %d\n", errno); - return ret; + perror("rdma_getaddrinfo"); + goto out; } memset(&attr, 0, sizeof attr); @@ -65,46 +66,76 @@ static int run(void) attr.qp_context = id; attr.sq_sig_all = 1; ret = rdma_create_ep(&id, res, NULL, &attr); - rdma_freeaddrinfo(res); + // Check to see if we got inline data allowed or not + if (attr.cap.max_inline_data >= 16) + send_flags = IBV_SEND_INLINE; + else + printf("rdma_client: device doesn't support IBV_SEND_INLINE, " + "using sge sends\n"); + if (ret) { - printf("rdma_create_ep %d\n", errno); - return ret; + perror("rdma_create_ep"); + goto out_free_addrinfo; } mr = rdma_reg_msgs(id, recv_msg, 16); if (!mr) { - printf("rdma_reg_msgs %d\n", errno); - return ret; + perror("rdma_reg_msgs for recv_msg"); + ret = -1; + goto out_destroy_ep; + } + if ((send_flags & IBV_SEND_INLINE) == 0) { + send_mr = rdma_reg_msgs(id, send_msg, 16); + if (!send_mr) { + perror("rdma_reg_msgs for send_msg"); + ret = -1; + goto out_dereg_recv; + } } ret = rdma_post_recv(id, NULL, recv_msg, 
16, mr); if (ret) { - printf("rdma_post_recv %d\n", errno); - return ret; + perror("rdma_post_recv"); + goto out_dereg_send; } ret = rdma_connect(id, NULL); if (ret) { - printf("rdma_connect %d\n", errno); - return ret; + perror("rdma_connect"); + goto out_dereg_send; } - ret = rdma_post_send(id, NULL, send_msg, 16, NULL, IBV_SEND_INLINE); + ret = rdma_post_send(id, NULL, send_msg, 16, send_mr, send_flags); if (ret) { - printf("rdma_post_send %d\n", errno); - return ret; + perror("rdma_post_send"); + goto out_disconnect; } - ret = rdma_get_recv_comp(id, &wc); - if (ret <= 0) { - printf("rdma_get_recv_comp %d\n", ret); - return ret; + while ((ret = rdma_get_send_comp(id, &wc)) == 0); + if (ret < 0) { + perror("rdma_get_send_comp"); + goto out_disconnect; } + while ((ret = rdma_get_recv_comp(id, &wc)) == 0); + if (ret < 0) + perror("rdma_get_recv_comp"); + else + ret = 0; + +out_disconnect: rdma_disconnect(id); +out_dereg_send: + if ((send_flags & IBV_SEND_INLINE) == 0) + rdma_dereg_mr(send_mr); +out_dereg_recv: rdma_dereg_mr(mr); +out_destroy_ep: rdma_destroy_ep(id); - return 0; +out_free_addrinfo: + rdma_freeaddrinfo(res); +out: + return ret; } int main(int argc, char **argv) diff --git a/librdmacm/examples/rdma_server.c b/librdmacm/examples/rdma_server.c index 5b9e16d5f65..129cf428f94 100644 --- a/librdmacm/examples/rdma_server.c +++ b/librdmacm/examples/rdma_server.c @@ -39,14 +39,16 @@ static char *port = "7471"; struct rdma_cm_id *listen_id, *id; -struct ibv_mr *mr; +struct ibv_mr *mr, *send_mr; +int send_flags; uint8_t send_msg[16]; uint8_t recv_msg[16]; static int run(void) { struct rdma_addrinfo hints, *res; - struct ibv_qp_init_attr attr; + struct ibv_qp_init_attr init_attr; + struct ibv_qp_attr qp_attr; struct ibv_wc wc; int ret; @@ -55,75 +57,106 @@ static int run(void) hints.ai_port_space = RDMA_PS_TCP; ret = rdma_getaddrinfo(NULL, port, &hints, &res); if (ret) { - printf("rdma_getaddrinfo %d\n", errno); + perror("rdma_getaddrinfo"); return ret; } - 
memset(&attr, 0, sizeof attr); - attr.cap.max_send_wr = attr.cap.max_recv_wr = 1; - attr.cap.max_send_sge = attr.cap.max_recv_sge = 1; - attr.cap.max_inline_data = 16; - attr.sq_sig_all = 1; - ret = rdma_create_ep(&listen_id, res, NULL, &attr); - rdma_freeaddrinfo(res); + memset(&init_attr, 0, sizeof init_attr); + init_attr.cap.max_send_wr = init_attr.cap.max_recv_wr = 1; + init_attr.cap.max_send_sge = init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_inline_data = 16; + init_attr.sq_sig_all = 1; + ret = rdma_create_ep(&listen_id, res, NULL, &init_attr); if (ret) { - printf("rdma_create_ep %d\n", errno); - return ret; + perror("rdma_create_ep"); + goto out_free_addrinfo; } ret = rdma_listen(listen_id, 0); if (ret) { - printf("rdma_listen %d\n", errno); - return ret; + perror("rdma_listen"); + goto out_destroy_listen_ep; } ret = rdma_get_request(listen_id, &id); if (ret) { - printf("rdma_get_request %d\n", errno); - return ret; + perror("rdma_get_request"); + goto out_destroy_listen_ep; } + memset(&qp_attr, 0, sizeof qp_attr); + memset(&init_attr, 0, sizeof init_attr); + ret = ibv_query_qp(id->qp, &qp_attr, IBV_QP_CAP, + &init_attr); + if (ret) { + perror("ibv_query_qp"); + goto out_destroy_accept_ep; + } + if (init_attr.cap.max_inline_data >= 16) + send_flags = IBV_SEND_INLINE; + else + printf("rdma_server: device doesn't support IBV_SEND_INLINE, " + "using sge sends\n"); + mr = rdma_reg_msgs(id, recv_msg, 16); if (!mr) { - printf("rdma_reg_msgs %d\n", errno); - return ret; + ret = -1; + perror("rdma_reg_msgs for recv_msg"); + goto out_destroy_accept_ep; + } + if ((send_flags & IBV_SEND_INLINE) == 0) { + send_mr = rdma_reg_msgs(id, send_msg, 16); + if (!send_mr) { + ret = -1; + perror("rdma_reg_msgs for send_msg"); + goto out_dereg_recv; + } } ret = rdma_post_recv(id, NULL, recv_msg, 16, mr); if (ret) { - printf("rdma_post_recv %d\n", errno); - return ret; + perror("rdma_post_recv"); + goto out_dereg_send; } ret = rdma_accept(id, NULL); if (ret) { - 
printf("rdma_accept %d\n", errno); - return ret; + perror("rdma_accept"); + goto out_dereg_send; } - ret = rdma_get_recv_comp(id, &wc); - if (ret <= 0) { - printf("rdma_get_recv_comp %d\n", ret); - return ret; + while ((ret = rdma_get_recv_comp(id, &wc)) == 0); + if (ret < 0) { + perror("rdma_get_recv_comp"); + goto out_disconnect; } - ret = rdma_post_send(id, NULL, send_msg, 16, NULL, IBV_SEND_INLINE); + ret = rdma_post_send(id, NULL, send_msg, 16, send_mr, send_flags); if (ret) { - printf("rdma_post_send %d\n", errno); - return ret; + perror("rdma_post_send"); + goto out_disconnect; } - ret = rdma_get_send_comp(id, &wc); - if (ret <= 0) { - printf("rdma_get_send_comp %d\n", ret); - return ret; - } + while ((ret = rdma_get_send_comp(id, &wc)) == 0); + if (ret < 0) + perror("rdma_get_send_comp"); + else + ret = 0; +out_disconnect: rdma_disconnect(id); +out_dereg_send: + if ((send_flags & IBV_SEND_INLINE) == 0) + rdma_dereg_mr(send_mr); +out_dereg_recv: rdma_dereg_mr(mr); +out_destroy_accept_ep: rdma_destroy_ep(id); +out_destroy_listen_ep: rdma_destroy_ep(listen_id); - return 0; +out_free_addrinfo: + rdma_freeaddrinfo(res); + return ret; } int main(int argc, char **argv) diff --git a/librdmacm/examples/rdma_xclient.c b/librdmacm/examples/rdma_xclient.c index e19229040d7..65104083a8d 100644 --- a/librdmacm/examples/rdma_xclient.c +++ b/librdmacm/examples/rdma_xclient.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2011 Intel Corporation. All rights reserved. + * Copyright (c) 2010-2014 Intel Corporation. All rights reserved. 
* * This software is available to you under the OpenIB.org BSD license * below: @@ -39,111 +39,19 @@ static char *server = "127.0.0.1"; static char port[6] = "7471"; -static int (*run_func)() = NULL; struct rdma_cm_id *id; struct ibv_mr *mr; -enum ibv_qp_type qpt = IBV_QPT_RC; +struct rdma_addrinfo hints; -#define MSG_SIZE 16 -uint8_t send_msg[MSG_SIZE]; -uint8_t recv_msg[MSG_SIZE]; - -#ifdef IBV_XRC_OPS -#define PRINT_XRC_OPT printf("\t x - XRC: extended-reliable-connected\n") +uint8_t send_msg[16]; uint32_t srqn; -/* - * Connect XRC SEND QP. - */ -static int xrc_connect_send(void) -{ - struct rdma_addrinfo hints, *res; - struct ibv_qp_init_attr attr; - int ret; - - memset(&hints, 0, sizeof hints); - hints.ai_port_space = RDMA_PS_IB; - hints.ai_qp_type = IBV_QPT_XRC_SEND; - ret = rdma_getaddrinfo(server, port, &hints, &res); - if (ret) { - printf("rdma_getaddrinfo connect send %d\n", errno); - return ret; - } - - memset(&attr, 0, sizeof attr); - attr.cap.max_send_wr = 1; - attr.cap.max_send_sge = 1; - attr.cap.max_inline_data = sizeof send_msg; - attr.qp_context = id; - attr.sq_sig_all = 1; - ret = rdma_create_ep(&id, res, NULL, &attr); - rdma_freeaddrinfo(res); - if (ret) { - printf("rdma_create_ep send qp %d\n", errno); - return ret; - } - - ret = rdma_connect(id, NULL); - if (ret) { - printf("rdma_connect send qp %d\n", errno); - return ret; - } - - return 0; -} - -/* - * Resolve remote SRQ number - */ -static int xrc_resolve_srqn(void) -{ - struct rdma_addrinfo hints, *res; - struct rdma_cm_id *id; - int ret; - - memset(&hints, 0, sizeof hints); - hints.ai_qp_type = IBV_QPT_UD; /* for now */ - hints.ai_port_space = RDMA_PS_IB; - sprintf(port, "%d", atoi(port) + 1); - ret = rdma_getaddrinfo(server, port, &hints, &res); - if (ret) { - printf("rdma_getaddrinfo resolve srqn %d\n", errno); - return ret; - } - - ret = rdma_create_ep(&id, res, NULL, NULL); - rdma_freeaddrinfo(res); - if (ret) { - printf("rdma_create_ep for srqn %d\n", errno); - return ret; - } - - 
ret = rdma_connect(id, NULL); - if (ret) { - printf("rdma_connect for srqn %d\n", errno); - return ret; - } - - srqn = id->event->param.ud.qp_num; - rdma_destroy_ep(id); - return 0; -} - -static int xrc_test(void) +static int post_send(void) { struct ibv_send_wr wr, *bad; struct ibv_sge sge; - struct ibv_wc wc; int ret; - ret = xrc_connect_send(); - if (ret) - return ret; - - ret = xrc_resolve_srqn(); - if (ret) - return ret; - sge.addr = (uint64_t) (uintptr_t) send_msg; sge.length = (uint32_t) sizeof send_msg; sge.lkey = 0; @@ -153,92 +61,64 @@ static int xrc_test(void) wr.num_sge = 1; wr.opcode = IBV_WR_SEND; wr.send_flags = IBV_SEND_INLINE; - wr.wr.xrc.remote_srqn = srqn; + if (hints.ai_qp_type == IBV_QPT_XRC_SEND) + wr.qp_type.xrc.remote_srqn = srqn; ret = ibv_post_send(id->qp, &wr, &bad); - if (ret) { - printf("rdma_post_send %d\n", errno); - return ret; - } - - ret = rdma_get_send_comp(id, &wc); - if (ret <= 0) { - printf("rdma_get_recv_comp %d\n", ret); - return ret; - } - - rdma_disconnect(id); - rdma_destroy_ep(id); - return 0; -} + if (ret) + perror("rdma_post_send"); -static inline int set_xrc_qpt(void) -{ - qpt = IBV_QPT_XRC_SEND; - run_func = xrc_test; - return 0; + return ret; } -#else -#define PRINT_XRC_OPT -#define set_xrc_qpt() -1 -#endif /* IBV_XRC_OPS */ - -static int rc_test(void) +static int test(void) { - struct rdma_addrinfo hints, *res; + struct rdma_addrinfo *res; struct ibv_qp_init_attr attr; struct ibv_wc wc; int ret; - memset(&hints, 0, sizeof hints); - hints.ai_port_space = RDMA_PS_TCP; ret = rdma_getaddrinfo(server, port, &hints, &res); if (ret) { - printf("rdma_getaddrinfo %d\n", errno); + perror("rdma_getaddrinfo"); return ret; } memset(&attr, 0, sizeof attr); attr.cap.max_send_wr = attr.cap.max_recv_wr = 1; attr.cap.max_send_sge = attr.cap.max_recv_sge = 1; - attr.cap.max_inline_data = sizeof send_msg; - attr.qp_context = id; attr.sq_sig_all = 1; ret = rdma_create_ep(&id, res, NULL, &attr); rdma_freeaddrinfo(res); if (ret) { - 
printf("rdma_create_ep %d\n", errno); + perror("rdma_create_ep"); return ret; } - mr = rdma_reg_msgs(id, recv_msg, sizeof recv_msg); + mr = rdma_reg_msgs(id, send_msg, sizeof send_msg); if (!mr) { - printf("rdma_reg_msgs %d\n", errno); - return ret; - } - - ret = rdma_post_recv(id, NULL, recv_msg, sizeof recv_msg, mr); - if (ret) { - printf("rdma_post_recv %d\n", errno); + perror("rdma_reg_msgs"); return ret; } ret = rdma_connect(id, NULL); if (ret) { - printf("rdma_connect %d\n", errno); + perror("rdma_connect"); return ret; } - ret = rdma_post_send(id, NULL, send_msg, sizeof send_msg, NULL, IBV_SEND_INLINE); + if (hints.ai_qp_type == IBV_QPT_XRC_SEND) + srqn = ntohl(*(uint32_t *) id->event->param.conn.private_data); + + ret = post_send(); if (ret) { - printf("rdma_post_send %d\n", errno); + perror("post_send"); return ret; } - ret = rdma_get_recv_comp(id, &wc); + ret = rdma_get_send_comp(id, &wc); if (ret <= 0) { - printf("rdma_get_recv_comp %d\n", ret); + perror("rdma_get_recv_comp"); return ret; } @@ -248,22 +128,13 @@ static int rc_test(void) return 0; } -static int set_qpt(char type) -{ - if (type == 'r') { - qpt = IBV_QPT_RC; - return 0; - } else if (type == 'x') { - return set_xrc_qpt(); - } - return -1; -} - int main(int argc, char **argv) { int op, ret; - run_func = rc_test; + hints.ai_port_space = RDMA_PS_TCP; + hints.ai_qp_type = IBV_QPT_RC; + while ((op = getopt(argc, argv, "s:p:c:")) != -1) { switch (op) { case 's': @@ -273,8 +144,16 @@ int main(int argc, char **argv) strncpy(port, optarg, sizeof port - 1); break; case 'c': - if (set_qpt(tolower(optarg[0]))) + switch (tolower(optarg[0])) { + case 'r': + break; + case 'x': + hints.ai_port_space = RDMA_PS_IB; + hints.ai_qp_type = IBV_QPT_XRC_SEND; + break; + default: goto err; + } break; default: goto err; @@ -282,7 +161,7 @@ int main(int argc, char **argv) } printf("%s: start\n", argv[0]); - ret = run_func(); + ret = test(); printf("%s: end %d\n", argv[0], ret); return ret; @@ -292,6 +171,6 @@ int 
main(int argc, char **argv) printf("\t[-p port_number]\n"); printf("\t[-c communication type]\n"); printf("\t r - RC: reliable-connected (default)\n"); - PRINT_XRC_OPT; + printf("\t x - XRC: extended-reliable-connected\n"); exit(1); } diff --git a/librdmacm/examples/rdma_xserver.c b/librdmacm/examples/rdma_xserver.c index df3e6653e65..d30c88e6008 100644 --- a/librdmacm/examples/rdma_xserver.c +++ b/librdmacm/examples/rdma_xserver.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2011 Intel Corporation. All rights reserved. + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. * * This software is available to you under the OpenIB.org BSD license * below: @@ -40,287 +40,100 @@ static char *port = "7471"; -static int (*run_func)(); struct rdma_cm_id *listen_id, *id; struct ibv_mr *mr; -enum ibv_qp_type qpt = IBV_QPT_RC; +struct rdma_addrinfo hints; -#define MSG_SIZE 16 -uint8_t send_msg[MSG_SIZE]; -uint8_t recv_msg[MSG_SIZE]; +uint8_t recv_msg[16]; +uint32_t srqn; - -#ifdef IBV_XRC_OPS -#define PRINT_XRC_OPT printf("\t x - XRC: extended-reliable-connected\n") -struct rdma_cm_id *srq_id; - -/* - * Listen for XRC RECV QP connection request. 
- */ -static struct rdma_cm_id * xrc_listen_recv(void) -{ - struct rdma_addrinfo hints, *res; - struct rdma_cm_id *id; - int ret; - - memset(&hints, 0, sizeof hints); - hints.ai_flags = RAI_PASSIVE; - hints.ai_port_space = RDMA_PS_IB; - hints.ai_qp_type = IBV_QPT_XRC_RECV; - ret = rdma_getaddrinfo(NULL, port, &hints, &res); - if (ret) { - printf("rdma_getaddrinfo listen recv %d\n", errno); - return NULL; - } - - ret = rdma_create_ep(&listen_id, res, NULL, NULL); - rdma_freeaddrinfo(res); - if (ret) { - printf("rdma_create_ep listen recv %d\n", errno); - return NULL; - } - - ret = rdma_listen(listen_id, 0); - if (ret) { - printf("rdma_listen %d\n", errno); - return NULL; - } - - ret = rdma_get_request(listen_id, &id); - if (ret) { - printf("rdma_get_request %d\n", errno); - return NULL; - } - - return id; -} - -/* - * Create SRQ and listen for XRC SRQN lookup request. - */ -static int xrc_create_srq_listen(struct sockaddr *addr, socklen_t addr_len) +static int create_srq(void) { - struct rdma_addrinfo rai; - struct sockaddr_storage ss; struct ibv_srq_init_attr attr; int ret; - memset(&rai, 0, sizeof rai); - rai.ai_flags = RAI_PASSIVE; - rai.ai_family = addr->sa_family; - rai.ai_qp_type = IBV_QPT_UD; /* for now */ - rai.ai_port_space = RDMA_PS_IB; - memcpy(&ss, addr, addr_len); - rai.ai_src_len = addr_len; - rai.ai_src_addr = (struct sockaddr *) &ss; - ((struct sockaddr_in *) &ss)->sin_port = htons((short) atoi(port) + 1); - - ret = rdma_create_ep(&srq_id, &rai, NULL, NULL); - if (ret) { - printf("rdma_create_ep srq ep %d\n", errno); - return ret; - } - - if (!srq_id->verbs) { - printf("rdma_create_ep failed to bind to device.\n"); - printf("XRC tests cannot use loopback addressing\n"); - return -1; - } - - memset(&attr, 0, sizeof attr); attr.attr.max_wr = 1; attr.attr.max_sge = 1; - attr.srq_type = IBV_SRQT_XRC; - - attr.ext.xrc.xrcd = ibv_open_xrcd(srq_id->verbs, -1, 0); - if (!attr.ext.xrc.xrcd) { - printf("Unable to open xrcd\n"); - return -1; - } - - ret = 
rdma_create_srq(srq_id, NULL, &attr); - if (ret) { - printf("Unable to create srq %d\n", errno); - return ret; - } - - ret = rdma_listen(srq_id, 0); - if (ret) { - printf("rdma_listen srq id %d\n", errno); - return ret; - } - - return 0; -} + attr.attr.srq_limit = 0; + attr.srq_context = id; -static int xrc_test(void) -{ - struct rdma_cm_id *conn_id, *lookup_id; - struct ibv_qp_init_attr attr; - struct rdma_conn_param param; - struct rdma_cm_event *event; - struct ibv_wc wc; - int ret; - - conn_id = xrc_listen_recv(); - if (!conn_id) - return -1; - - ret = xrc_create_srq_listen(rdma_get_local_addr(conn_id), - sizeof(struct sockaddr_storage)); + ret = rdma_create_srq(id, NULL, &attr); if (ret) - return -1; + perror("rdma_create_srq:"); - memset(&attr, 0, sizeof attr); - attr.qp_type = IBV_QPT_XRC_RECV; - attr.ext.xrc_recv.xrcd = srq_id->srq->ext.xrc.xrcd; - ret = rdma_create_qp(conn_id, NULL, &attr); - if (ret) { - printf("Unable to create xrc recv qp %d\n", errno); - return ret; + if (id->srq) { + ibv_get_srq_num(id->srq, &srqn); + srqn = htonl(srqn); } - - ret = rdma_accept(conn_id, NULL); - if (ret) { - printf("rdma_accept failed for xrc recv qp %d\n", errno); - return ret; - } - - ret = rdma_get_request(srq_id, &lookup_id); - if (ret) { - printf("rdma_get_request %d\n", errno); - return ret; - } - - mr = rdma_reg_msgs(srq_id, recv_msg, sizeof recv_msg); - if (!mr) { - printf("ibv_reg_msgs %d\n", errno); - return ret; - } - - ret = rdma_post_recv(srq_id, NULL, recv_msg, sizeof recv_msg, mr); - if (ret) { - printf("rdma_post_recv %d\n", errno); - return ret; - } - - memset(&param, 0, sizeof param); - param.qp_num = srq_id->srq->ext.xrc.srq_num; - ret = rdma_accept(lookup_id, &param); - if (ret) { - printf("rdma_accept failed for srqn lookup %d\n", errno); - return ret; - } - - rdma_destroy_id(lookup_id); - - ret = rdma_get_recv_comp(srq_id, &wc); - if (ret <= 0) { - printf("rdma_get_recv_comp %d\n", ret); - return ret; - } - - ret = rdma_get_cm_event(conn_id->channel, 
&event); - if (ret || event->event != RDMA_CM_EVENT_DISCONNECTED) { - printf("Failed to get disconnect event\n"); - return -1; - } - - rdma_ack_cm_event(event); - rdma_disconnect(conn_id); - rdma_destroy_ep(conn_id); - rdma_dereg_mr(mr); - rdma_destroy_ep(srq_id); - rdma_destroy_ep(listen_id); - return 0; -} - -static inline int set_xrc_qpt(void) -{ - qpt = IBV_QPT_XRC_RECV; - run_func = xrc_test; - return 0; + return ret; } -#else -#define PRINT_XRC_OPT -#define set_xrc_qpt() -1 -#endif /* IBV_XRC_OPS */ - - -static int rc_test(void) +static int test(void) { - struct rdma_addrinfo hints, *res; + struct rdma_addrinfo *res; struct ibv_qp_init_attr attr; + struct rdma_conn_param param; struct ibv_wc wc; int ret; - memset(&hints, 0, sizeof hints); - hints.ai_flags = RAI_PASSIVE; - hints.ai_port_space = RDMA_PS_TCP; ret = rdma_getaddrinfo(NULL, port, &hints, &res); if (ret) { - printf("rdma_getaddrinfo %d\n", errno); + perror("rdma_getaddrinfo"); return ret; } memset(&attr, 0, sizeof attr); attr.cap.max_send_wr = attr.cap.max_recv_wr = 1; attr.cap.max_send_sge = attr.cap.max_recv_sge = 1; - attr.cap.max_inline_data = sizeof send_msg; - attr.sq_sig_all = 1; ret = rdma_create_ep(&listen_id, res, NULL, &attr); rdma_freeaddrinfo(res); if (ret) { - printf("rdma_create_ep %d\n", errno); + perror("rdma_create_ep"); return ret; } ret = rdma_listen(listen_id, 0); if (ret) { - printf("rdma_listen %d\n", errno); + perror("rdma_listen"); return ret; } ret = rdma_get_request(listen_id, &id); if (ret) { - printf("rdma_get_request %d\n", errno); + perror("rdma_get_request"); return ret; } + if (hints.ai_qp_type == IBV_QPT_XRC_RECV) { + ret = create_srq(); + if (ret) + return ret; + } + mr = rdma_reg_msgs(id, recv_msg, sizeof recv_msg); if (!mr) { - printf("rdma_reg_msgs %d\n", errno); + perror("rdma_reg_msgs"); return ret; } ret = rdma_post_recv(id, NULL, recv_msg, sizeof recv_msg, mr); if (ret) { - printf("rdma_post_recv %d\n", errno); + perror("rdma_post_recv"); return ret; } - ret 
= rdma_accept(id, NULL); + memset(&param, 0, sizeof param); + param.private_data = &srqn; + param.private_data_len = sizeof srqn; + ret = rdma_accept(id, &param); if (ret) { - printf("rdma_accept %d\n", errno); + perror("rdma_accept"); return ret; } ret = rdma_get_recv_comp(id, &wc); if (ret <= 0) { - printf("rdma_get_recv_comp %d\n", ret); - return ret; - } - - ret = rdma_post_send(id, NULL, send_msg, sizeof send_msg, NULL, IBV_SEND_INLINE); - if (ret) { - printf("rdma_post_send %d\n", errno); - return ret; - } - - ret = rdma_get_send_comp(id, &wc); - if (ret <= 0) { - printf("rdma_get_send_comp %d\n", ret); + perror("rdma_get_recv_comp"); return ret; } @@ -331,30 +144,30 @@ static int rc_test(void) return 0; } -static int set_qpt(char type) -{ - if (type == 'r') { - qpt = IBV_QPT_RC; - return 0; - } else if (type == 'x') { - return set_xrc_qpt(); - } - return -1; -} - int main(int argc, char **argv) { int op, ret; - run_func = rc_test; + hints.ai_flags = RAI_PASSIVE; + hints.ai_port_space = RDMA_PS_TCP; + hints.ai_qp_type = IBV_QPT_RC; + while ((op = getopt(argc, argv, "p:c:")) != -1) { switch (op) { case 'p': port = optarg; break; case 'c': - if (set_qpt(tolower(optarg[0]))) + switch (tolower(optarg[0])) { + case 'r': + break; + case 'x': + hints.ai_port_space = RDMA_PS_IB; + hints.ai_qp_type = IBV_QPT_XRC_RECV; + break; + default: goto err; + } break; default: goto err; @@ -362,7 +175,7 @@ int main(int argc, char **argv) } printf("%s: start\n", argv[0]); - ret = run_func(); + ret = test(); printf("%s: end %d\n", argv[0], ret); return ret; @@ -371,6 +184,6 @@ int main(int argc, char **argv) printf("\t[-p port_number]\n"); printf("\t[-c communication type]\n"); printf("\t r - RC: reliable-connected (default)\n"); - PRINT_XRC_OPT; + printf("\t x - XRC: extended-reliable-connected\n"); exit(1); } diff --git a/librdmacm/examples/riostream.c b/librdmacm/examples/riostream.c index 4c0d21fb8d9..c12dd0d40f8 100644 --- a/librdmacm/examples/riostream.c +++ 
b/librdmacm/examples/riostream.c @@ -148,9 +148,6 @@ static int send_msg(int size) struct pollfd fds; int offset, ret; - if (verify) - format_buf(buf, size); - if (use_async) { fds.fd = rs; fds.events = POLLOUT; @@ -180,9 +177,6 @@ static int send_xfer(int size) struct pollfd fds; int offset, ret; - if (verify) - format_buf(buf, size - 1); - if (use_async) { fds.fd = rs; fds.events = POLLOUT; @@ -233,12 +227,6 @@ static int recv_msg(int size) } } - if (verify) { - ret = verify_buf(buf, size); - if (ret) - return ret; - } - return 0; } @@ -296,6 +284,8 @@ static int run_test(void) goto out; } *poll_byte = (uint8_t) marker++; + if (verify) + format_buf(buf, transfer_size - 1); ret = send_xfer(transfer_size); if (ret) goto out; @@ -312,6 +302,8 @@ static int run_test(void) goto out; } *poll_byte = (uint8_t) marker++; + if (verify) + format_buf(buf, transfer_size - 1); ret = send_xfer(transfer_size); } if (ret) diff --git a/librdmacm/examples/rstream.c b/librdmacm/examples/rstream.c index 27326c2ebc7..05598a89755 100644 --- a/librdmacm/examples/rstream.c +++ b/librdmacm/examples/rstream.c @@ -577,7 +577,7 @@ static int set_test_opt(char *optarg) flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; } else if (!strncasecmp("nonblock", optarg, 8)) { flags |= MSG_DONTWAIT; - } else if (strncasecmp("resolve", optarg, 7)) { + } else if (!strncasecmp("resolve", optarg, 7)) { use_rgai = 1; } else if (!strncasecmp("verify", optarg, 6)) { verify = 1; diff --git a/librdmacm/examples/udpong.c b/librdmacm/examples/udpong.c index af8deb9ee8b..7ec11e85ae5 100644 --- a/librdmacm/examples/udpong.c +++ b/librdmacm/examples/udpong.c @@ -347,7 +347,7 @@ static ssize_t client_recv(struct message *msg, size_t size, int timeout) } ret = rs_recv(rs, msg, size, flags | MSG_DONTWAIT); - if (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)) + if (ret < 0 && errno != EWOULDBLOCK && errno != EAGAIN) perror("rrecv"); return ret; diff --git a/librdmacm/include/rdma/rdma_cma.h 
b/librdmacm/include/rdma/rdma_cma.h index 4c4a057eaec..4826c0304b0 100644 --- a/librdmacm/include/rdma/rdma_cma.h +++ b/librdmacm/include/rdma/rdma_cma.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. - * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -384,6 +384,8 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); */ int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); +int rdma_create_qp_ex(struct rdma_cm_id *id, + struct ibv_qp_init_attr_ex *qp_init_attr); /** * rdma_destroy_qp - Deallocate a QP. diff --git a/librdmacm/include/rdma/rdma_verbs.h b/librdmacm/include/rdma/rdma_verbs.h index 198c6a595b1..10049c31ce1 100644 --- a/librdmacm/include/rdma/rdma_verbs.h +++ b/librdmacm/include/rdma/rdma_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2011 Intel Corporation. All rights reserved. + * Copyright (c) 2010-2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -56,6 +56,7 @@ static inline int rdma_seterrno(int ret) */ int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, struct ibv_srq_init_attr *attr); +int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr); void rdma_destroy_srq(struct rdma_cm_id *id); diff --git a/librdmacm/librdmacm.spec.in b/librdmacm/librdmacm.spec.in index 15e0e31cd12..60ffb3b325e 100644 --- a/librdmacm/librdmacm.spec.in +++ b/librdmacm/librdmacm.spec.in @@ -1,7 +1,7 @@ %define ver @VERSION@ Name: librdmacm -Version: 1.0.18 +Version: 1.0.19-1 Release: 1%{?dist} Summary: Userspace RDMA Connection Manager diff --git a/librdmacm/man/rdma_accept.3 b/librdmacm/man/rdma_accept.3 index 92e1a07db5c..0859fac6ef2 100644 --- a/librdmacm/man/rdma_accept.3 +++ b/librdmacm/man/rdma_accept.3 @@ -1,4 +1,4 @@ -.TH "RDMA_ACCEPT" 3 "2007-10-31" "librdmacm" "Librdmacm Programmer's Manual" librdmacm +.TH "RDMA_ACCEPT" 3 "2014-05-27" "librdmacm" "Librdmacm Programmer's Manual" librdmacm .SH NAME rdma_accept \- Called to accept a connection request. .SH SYNOPSIS @@ -35,6 +35,9 @@ fields when accepting. Users may reference the rdma_conn_param structure in the connection event directly, or can reference their own structure. If the rdma_conn_param structure from an event is referenced, the event must not be acked until after this call returns. +.P +If the conn_param parameter is NULL, the values reported in the connection +request event are used, adjusted down based on local hardware restrictions. .IP private_data References a user-controlled data buffer. The contents of the buffer are copied and transparently passed to the remote side as part of the diff --git a/librdmacm/src/addrinfo.c b/librdmacm/src/addrinfo.c index 68eaddd3497..cdeb6638120 100644 --- a/librdmacm/src/addrinfo.c +++ b/librdmacm/src/addrinfo.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Intel Corporation. All rights reserved. 
+ * Copyright (c) 2010-2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -45,14 +45,6 @@ #include #include -#ifdef IBV_XRC_OPS -#define RDMA_QPT_XRC_SEND IBV_QPT_XRC_SEND -#define RDMA_QPT_XRC_RECV IBV_QPT_XRC_RECV -#else -#define RDMA_QPT_XRC_SEND 9 -#define RDMA_QPT_XRC_RECV 10 -#endif - struct rdma_addrinfo nohints; static void ucma_convert_to_ai(struct addrinfo *ai, struct rdma_addrinfo *rai) @@ -68,8 +60,8 @@ static void ucma_convert_to_ai(struct addrinfo *ai, struct rdma_addrinfo *rai) switch (rai->ai_qp_type) { case IBV_QPT_RC: case IBV_QPT_UC: - case RDMA_QPT_XRC_SEND: - case RDMA_QPT_XRC_RECV: + case IBV_QPT_XRC_SEND: + case IBV_QPT_XRC_RECV: ai->ai_socktype = SOCK_STREAM; break; case IBV_QPT_UD: diff --git a/librdmacm/src/cma.c b/librdmacm/src/cma.c index 0dc229e2eeb..749140ecbfd 100644 --- a/librdmacm/src/cma.c +++ b/librdmacm/src/cma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -76,6 +76,7 @@ do { \ struct cma_device { struct ibv_context *verbs; struct ibv_pd *pd; + struct ibv_xrcd *xrcd; uint64_t guid; int port_cnt; int refcnt; @@ -321,7 +322,7 @@ static int ucma_init_device(struct cma_device *cma_dev) return ret; } -int ucma_init_all(void) +static int ucma_init_all(void) { int i, ret = 0; @@ -416,10 +417,10 @@ static int ucma_get_device(struct cma_id_private *id_priv, uint64_t guid) return ERR(ENODEV); match: + pthread_mutex_lock(&mut); if ((ret = ucma_init_device(cma_dev))) - return ret; + goto out; - pthread_mutex_lock(&mut); if (!cma_dev->refcnt++) { cma_dev->pd = ibv_alloc_pd(cma_dev->verbs); if (!cma_dev->pd) { @@ -439,11 +440,30 @@ static int ucma_get_device(struct cma_id_private *id_priv, uint64_t guid) static void ucma_put_device(struct cma_device *cma_dev) { pthread_mutex_lock(&mut); - if (!--cma_dev->refcnt) + if (!--cma_dev->refcnt) { ibv_dealloc_pd(cma_dev->pd); + if (cma_dev->xrcd) + ibv_close_xrcd(cma_dev->xrcd); + } pthread_mutex_unlock(&mut); } +static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev) +{ + struct ibv_xrcd_init_attr attr; + + pthread_mutex_lock(&mut); + if (!cma_dev->xrcd) { + memset(&attr, 0, sizeof attr); + attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; + attr.fd = -1; + attr.oflags = O_CREAT; + cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr); + } + pthread_mutex_unlock(&mut); + return cma_dev->xrcd; +} + static void ucma_insert_id(struct cma_id_private *id_priv) { fastlock_acquire(&idm_lock); @@ -701,11 +721,11 @@ static void ucma_convert_path(struct ibv_path_data *path_data, sa_path->numb_path = 1; sa_path->pkey = path_data->path.pkey; sa_path->sl = ntohs(path_data->path.qosclass_sl) & 0xF; - sa_path->mtu_selector = 1; + sa_path->mtu_selector = 2; /* exactly */ sa_path->mtu = path_data->path.mtu & 0x1F; - sa_path->rate_selector = 1; + sa_path->rate_selector = 2; sa_path->rate = path_data->path.rate & 0x1F; - 
sa_path->packet_life_time_selector = 1; + sa_path->packet_life_time_selector = 2; sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; sa_path->preference = (uint8_t) path_data->flags; @@ -720,9 +740,6 @@ static int ucma_query_path(struct rdma_cm_id *id) size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; resp = alloca(size); - if (!resp) - return ERR(ENOMEM); - CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; @@ -1193,17 +1210,26 @@ static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) static void ucma_destroy_cqs(struct rdma_cm_id *id) { - if (id->recv_cq) + if (id->qp_type == IBV_QPT_XRC_RECV && id->srq) + return; + + if (id->recv_cq) { ibv_destroy_cq(id->recv_cq); + if (id->send_cq && (id->send_cq != id->recv_cq)) { + ibv_destroy_cq(id->send_cq); + id->send_cq = NULL; + } + id->recv_cq = NULL; + } - if (id->recv_cq_channel) + if (id->recv_cq_channel) { ibv_destroy_comp_channel(id->recv_cq_channel); - - if (id->send_cq && (id->send_cq != id->recv_cq)) - ibv_destroy_cq(id->send_cq); - - if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) - ibv_destroy_comp_channel(id->send_cq_channel); + if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) { + ibv_destroy_comp_channel(id->send_cq_channel); + id->send_cq_channel = NULL; + } + id->recv_cq_channel = NULL; + } } static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) @@ -1236,36 +1262,44 @@ static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t r return ERR(ENOMEM); } -int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, - struct ibv_srq_init_attr *attr) +int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr) { + struct cma_id_private *id_priv; struct ibv_srq *srq; int ret; - if (!pd) - pd = id->pd; + id_priv = container_of(id, struct cma_id_private, id); + 
if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE)) + return ERR(EINVAL); + + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) { + attr->pd = id->pd; + attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD; + } -#ifdef IBV_XRC_OPS if (attr->srq_type == IBV_SRQT_XRC) { - if (!attr->ext.xrc.cq) { + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) { + attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); + if (!attr->xrcd) + return -1; + } + if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) { ret = ucma_create_cqs(id, 0, attr->attr.max_wr); if (ret) return ret; - - attr->ext.xrc.cq = id->recv_cq; + attr->cq = id->recv_cq; } + attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ; } - srq = ibv_create_xsrq(pd, attr); -#else - srq = ibv_create_srq(pd, attr); -#endif + srq = ibv_create_srq_ex(id->verbs, attr); if (!srq) { ret = -1; goto err; } - id->pd = pd; + if (!id->pd) + id->pd = attr->pd; id->srq = srq; return 0; err: @@ -1273,16 +1307,34 @@ int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, return ret; } +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct ibv_srq_init_attr_ex attr_ex; + int ret; + + memcpy(&attr_ex, attr, sizeof *attr); + attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD; + if (id->qp_type == IBV_QPT_XRC_RECV) { + attr_ex.srq_type = IBV_SRQT_XRC; + } else { + attr_ex.srq_type = IBV_SRQT_BASIC; + } + attr_ex.pd = pd; + ret = rdma_create_srq_ex(id, &attr_ex); + memcpy(attr, &attr_ex, sizeof *attr); + return ret; +} + void rdma_destroy_srq(struct rdma_cm_id *id) { ibv_destroy_srq(id->srq); - if (!id->qp) - ucma_destroy_cqs(id); id->srq = NULL; + ucma_destroy_cqs(id); } -int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, - struct ibv_qp_init_attr *qp_init_attr) +int rdma_create_qp_ex(struct rdma_cm_id *id, + struct ibv_qp_init_attr_ex *attr) { struct cma_id_private *id_priv; struct ibv_qp *qp; @@ -1292,21 +1344,37 @@ int 
rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, return ERR(EINVAL); id_priv = container_of(id, struct cma_id_private, id); - if (!pd) - pd = id->pd; - else if (id->verbs != pd->context) + if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) { + attr->comp_mask |= IBV_QP_INIT_ATTR_PD; + attr->pd = id->pd; + } else if (id->verbs != attr->pd->context) return ERR(EINVAL); - ret = ucma_create_cqs(id, qp_init_attr->send_cq ? 0 : qp_init_attr->cap.max_send_wr, - qp_init_attr->recv_cq ? 0 : qp_init_attr->cap.max_recv_wr); + if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) || + (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq)) + return ERR(EINVAL); + + if (id->qp_type == IBV_QPT_XRC_RECV) { + if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) { + attr->xrcd = ucma_get_xrcd(id_priv->cma_dev); + if (!attr->xrcd) + return -1; + attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD; + } + } + + ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr, + attr->recv_cq || id->recv_cq ? 
0 : attr->cap.max_recv_wr); if (ret) return ret; - if (!qp_init_attr->send_cq) - qp_init_attr->send_cq = id->send_cq; - if (!qp_init_attr->recv_cq) - qp_init_attr->recv_cq = id->recv_cq; - qp = ibv_create_qp(pd, qp_init_attr); + if (!attr->send_cq) + attr->send_cq = id->send_cq; + if (!attr->recv_cq) + attr->recv_cq = id->recv_cq; + if (id->srq && !attr->srq) + attr->srq = id->srq; + qp = ibv_create_qp_ex(id->verbs, attr); if (!qp) { ret = ERR(ENOMEM); goto err1; @@ -1319,7 +1387,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, if (ret) goto err2; - id->pd = pd; + id->pd = qp->pd; id->qp = qp; return 0; err2: @@ -1329,11 +1397,25 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, return ret; } +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct ibv_qp_init_attr_ex attr_ex; + int ret; + + memcpy(&attr_ex, qp_init_attr, sizeof *qp_init_attr); + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = pd ? pd : id->pd; + ret = rdma_create_qp_ex(id, &attr_ex); + memcpy(qp_init_attr, &attr_ex, sizeof *qp_init_attr); + return ret; +} + void rdma_destroy_qp(struct rdma_cm_id *id) { ibv_destroy_qp(id->qp); - ucma_destroy_cqs(id); id->qp = NULL; + ucma_destroy_cqs(id); } static int ucma_valid_param(struct cma_id_private *id_priv, diff --git a/librdmacm/src/cma.h b/librdmacm/src/cma.h index a7bab0f2a92..98eba8dc21e 100644 --- a/librdmacm/src/cma.h +++ b/librdmacm/src/cma.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005-2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -167,46 +167,6 @@ void ucma_ib_init(); void ucma_ib_cleanup(); void ucma_ib_resolve(struct rdma_addrinfo **rai, struct rdma_addrinfo *hints); -/* Define path record definition if using older version of libibverbs */ -#ifdef DEFINE_PATH_RECORD -#define IBV_PATH_RECORD_REVERSIBLE 0x80 - -struct ibv_path_record -{ - uint64_t service_id; - union ibv_gid dgid; - union ibv_gid sgid; - uint16_t dlid; - uint16_t slid; - uint32_t flowlabel_hoplimit; /* resv-31:28 flow label-27:8 hop limit-7:0*/ - uint8_t tclass; - uint8_t reversible_numpath; /* reversible-7:7 num path-6:0 */ - uint16_t pkey; - uint16_t qosclass_sl; /* qos class-15:4 sl-3:0 */ - uint8_t mtu; /* mtu selector-7:6 mtu-5:0 */ - uint8_t rate; /* rate selector-7:6 rate-5:0 */ - uint8_t packetlifetime; /* lifetime selector-7:6 lifetime-5:0 */ - uint8_t preference; - uint8_t reserved[6]; -}; - -#define IBV_PATH_FLAG_GMP (1<<0) -#define IBV_PATH_FLAG_PRIMARY (1<<1) -#define IBV_PATH_FLAG_ALTERNATE (1<<2) -#define IBV_PATH_FLAG_OUTBOUND (1<<3) -#define IBV_PATH_FLAG_INBOUND (1<<4) -#define IBV_PATH_FLAG_INBOUND_REVERSE (1<<5) -#define IBV_PATH_FLAG_BIDIRECTIONAL (IBV_PATH_FLAG_OUTBOUND | \ - IBV_PATH_FLAG_INBOUND_REVERSE) - -struct ibv_path_data -{ - uint32_t flags; - uint32_t reserved; - struct ibv_path_record path; -}; -#endif - struct ib_connect_hdr { uint8_t cma_version; uint8_t ip_version; /* IP version: 7:4 */ diff --git a/librdmacm/src/indexer.c b/librdmacm/src/indexer.c index be2e69c895c..f9042f56bc2 100644 --- a/librdmacm/src/indexer.c +++ b/librdmacm/src/indexer.c @@ -151,6 +151,7 @@ int idm_set(struct index_map *idm, int index, void *item) entry = idm->array[idx_array_index(index)]; entry[idx_entry_index(index)] = item; + idm->count[idx_array_index(index)]++; return index; } @@ -162,5 +163,9 @@ void *idm_clear(struct index_map *idm, int index) entry = idm->array[idx_array_index(index)]; item = entry[idx_entry_index(index)]; 
entry[idx_entry_index(index)] = NULL; + if (--idm->count[idx_array_index(index)] == 0) { + free(idm->array[idx_array_index(index)]); + idm->array[idx_array_index(index)] = NULL; + } return item; } diff --git a/librdmacm/src/indexer.h b/librdmacm/src/indexer.h index 0c5f3882673..fc8eae248c1 100644 --- a/librdmacm/src/indexer.h +++ b/librdmacm/src/indexer.h @@ -85,6 +85,7 @@ static inline void *idx_at(struct indexer *idx, int index) struct index_map { void **array[IDX_ARRAY_SIZE]; + int count[IDX_ARRAY_SIZE]; }; int idm_set(struct index_map *idm, int index, void *item); diff --git a/librdmacm/src/librdmacm.map b/librdmacm/src/librdmacm.map index d5ef7363ca7..ffbd199d311 100644 --- a/librdmacm/src/librdmacm.map +++ b/librdmacm/src/librdmacm.map @@ -66,5 +66,7 @@ RDMACM_1.0 { riomap; riounmap; riowrite; + rdma_create_srq_ex; + rdma_create_qp_ex; local: *; }; diff --git a/librdmacm/src/rsocket.c b/librdmacm/src/rsocket.c index 30ea55ddcd8..7007897b170 100644 --- a/librdmacm/src/rsocket.c +++ b/librdmacm/src/rsocket.c @@ -59,8 +59,9 @@ #define RS_OLAP_START_SIZE 2048 #define RS_MAX_TRANSFER 65536 #define RS_SNDLOWAT 2048 +#define RS_QP_MIN_SIZE 16 #define RS_QP_MAX_SIZE 0xFFFE -#define RS_QP_CTRL_SIZE 4 +#define RS_QP_CTRL_SIZE 4 /* must be power of 2 */ #define RS_CONN_RETRIES 6 #define RS_SGL_SIZE 2 static struct index_map idm; @@ -155,6 +156,7 @@ enum { enum { RS_CTRL_DISCONNECT, + RS_CTRL_KEEPALIVE, RS_CTRL_SHUTDOWN }; @@ -194,7 +196,7 @@ struct rs_iomap_mr { int index; /* -1 if mapping is local and not in iomap_list */ }; -#define RS_MIN_INLINE (sizeof(struct rs_sge)) +#define RS_MAX_CTRL_MSG (sizeof(struct rs_sge)) #define rs_host_is_net() (1 == htonl(1)) #define RS_CONN_FLAG_NET (1 << 0) #define RS_CONN_FLAG_IOMAP (1 << 1) @@ -307,7 +309,8 @@ struct rsocket { uint64_t tcp_opts; unsigned int keepalive_time; - int ctrl_avail; + unsigned int ctrl_seqno; + unsigned int ctrl_max_seqno; uint16_t sseq_no; uint16_t sseq_comp; uint16_t rseq_no; @@ -504,9 +507,6 @@ void 
rs_configure(void) if ((f = fopen(RS_CONF_DIR "/inline_default", "r"))) { (void) fscanf(f, "%hu", &def_inline); fclose(f); - - if (def_inline < RS_MIN_INLINE) - def_inline = RS_MIN_INLINE; } if ((f = fopen(RS_CONF_DIR "/sqsize_default", "r"))) { @@ -562,6 +562,7 @@ static void rs_remove(struct rsocket *rs) pthread_mutex_unlock(&mut); } +/* We only inherit from listening sockets */ static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) { struct rsocket *rs; @@ -584,7 +585,7 @@ static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) rs->sq_size = inherited_rs->sq_size; rs->rq_size = inherited_rs->rq_size; if (type == SOCK_STREAM) { - rs->ctrl_avail = inherited_rs->ctrl_avail; + rs->ctrl_max_seqno = inherited_rs->ctrl_max_seqno; rs->target_iomap_size = inherited_rs->target_iomap_size; } } else { @@ -594,7 +595,7 @@ static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) rs->sq_size = def_sqsize; rs->rq_size = def_rqsize; if (type == SOCK_STREAM) { - rs->ctrl_avail = RS_QP_CTRL_SIZE; + rs->ctrl_max_seqno = RS_QP_CTRL_SIZE; rs->target_iomap_size = def_iomap_size; } } @@ -642,15 +643,13 @@ static void rs_set_qp_size(struct rsocket *rs) if (rs->sq_size > max_size) rs->sq_size = max_size; - else if (rs->sq_size < 4) - rs->sq_size = 4; - if (rs->sq_size <= (RS_QP_CTRL_SIZE << 2)) - rs->ctrl_avail = 2; + else if (rs->sq_size < RS_QP_MIN_SIZE) + rs->sq_size = RS_QP_MIN_SIZE; if (rs->rq_size > max_size) rs->rq_size = max_size; - else if (rs->rq_size < 4) - rs->rq_size = 4; + else if (rs->rq_size < RS_QP_MIN_SIZE) + rs->rq_size = RS_QP_MIN_SIZE; } static void ds_set_qp_size(struct rsocket *rs) @@ -677,18 +676,21 @@ static void ds_set_qp_size(struct rsocket *rs) static int rs_init_bufs(struct rsocket *rs) { - uint32_t rbuf_msg_size; + uint32_t total_rbuf_size, total_sbuf_size; size_t len; rs->rmsg = calloc(rs->rq_size + 1, sizeof(*rs->rmsg)); if (!rs->rmsg) return ERR(ENOMEM); - rs->sbuf = calloc(rs->sbuf_size, sizeof(*rs->sbuf)); 
+ total_sbuf_size = rs->sbuf_size; + if (rs->sq_inline < RS_MAX_CTRL_MSG) + total_sbuf_size += RS_MAX_CTRL_MSG * RS_QP_CTRL_SIZE; + rs->sbuf = calloc(total_sbuf_size, 1); if (!rs->sbuf) return ERR(ENOMEM); - rs->smr = rdma_reg_msgs(rs->cm_id, rs->sbuf, rs->sbuf_size); + rs->smr = rdma_reg_msgs(rs->cm_id, rs->sbuf, total_sbuf_size); if (!rs->smr) return -1; @@ -707,14 +709,14 @@ static int rs_init_bufs(struct rsocket *rs) if (rs->target_iomap_size) rs->target_iomap = (struct rs_iomap *) (rs->target_sgl + RS_SGL_SIZE); - rbuf_msg_size = rs->rbuf_size; + total_rbuf_size = rs->rbuf_size; if (rs->opts & RS_OPT_MSG_SEND) - rbuf_msg_size += rs->rq_size * RS_MSG_SIZE; - rs->rbuf = calloc(rbuf_msg_size, 1); + total_rbuf_size += rs->rq_size * RS_MSG_SIZE; + rs->rbuf = calloc(total_rbuf_size, 1); if (!rs->rbuf) return ERR(ENOMEM); - rs->rmr = rdma_reg_write(rs->cm_id, rs->rbuf, rbuf_msg_size); + rs->rmr = rdma_reg_write(rs->cm_id, rs->rbuf, total_rbuf_size); if (!rs->rmr) return -1; @@ -724,7 +726,7 @@ static int rs_init_bufs(struct rsocket *rs) rs->rbuf_free_offset = rs->rbuf_size >> 1; rs->rbuf_bytes_avail = rs->rbuf_size >> 1; - rs->sqe_avail = rs->sq_size - rs->ctrl_avail; + rs->sqe_avail = rs->sq_size - rs->ctrl_max_seqno; rs->rseq_comp = rs->rq_size >> 1; return 0; } @@ -860,6 +862,10 @@ static int rs_create_ep(struct rsocket *rs) if (ret) return ret; + rs->sq_inline = qp_attr.cap.max_inline_data; + if ((rs->opts & RS_OPT_MSG_SEND) && (rs->sq_inline < RS_MSG_SIZE)) + return ERR(ENOTSUP); + for (i = 0; i < rs->rq_size; i++) { ret = rs_post_recv(rs); if (ret) @@ -1491,6 +1497,7 @@ static int ds_create_qp(struct rsocket *rs, union socket_addr *src_addr, if (ret) goto err; + rs->sq_inline = qp_attr.cap.max_inline_data; ret = ds_add_qp_dest(qp, src_addr, addrlen); if (ret) goto err; @@ -1602,6 +1609,12 @@ int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen) return ret; } +static void *rs_get_ctrl_buf(struct rsocket *rs) +{ + return rs->sbuf + 
rs->sbuf_size + + RS_MAX_CTRL_MSG * (rs->ctrl_seqno & (RS_QP_CTRL_SIZE - 1)); +} + static int rs_post_msg(struct rsocket *rs, uint32_t msg) { struct ibv_send_wr wr, *bad; @@ -1763,7 +1776,7 @@ static int rs_write_iomap(struct rsocket *rs, struct rs_iomap_mr *iomr, addr = rs->remote_iomap.addr + iomr->index * sizeof(struct rs_iomap); return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_IOMAP_SGL, iomr->index), - flags, addr, rs->remote_iomap.key); + flags, addr, rs->remote_iomap.key); } static uint32_t rs_sbuf_left(struct rsocket *rs) @@ -1775,13 +1788,14 @@ static uint32_t rs_sbuf_left(struct rsocket *rs) static void rs_send_credits(struct rsocket *rs) { struct ibv_sge ibsge; - struct rs_sge sge; + struct rs_sge sge, *sge_buf; + int flags; - rs->ctrl_avail--; + rs->ctrl_seqno++; rs->rseq_comp = rs->rseq_no + (rs->rq_size >> 1); if (rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) { if (rs->opts & RS_OPT_MSG_SEND) - rs->ctrl_avail--; + rs->ctrl_seqno++; if (!(rs->opts & RS_OPT_SWAP_SGL)) { sge.addr = (uintptr_t) &rs->rbuf[rs->rbuf_free_offset]; @@ -1793,16 +1807,23 @@ static void rs_send_credits(struct rsocket *rs) sge.length = bswap_32(rs->rbuf_size >> 1); } - ibsge.addr = (uintptr_t) &sge; - ibsge.lkey = 0; + if (rs->sq_inline < sizeof sge) { + sge_buf = rs_get_ctrl_buf(rs); + memcpy(sge_buf, &sge, sizeof sge); + ibsge.addr = (uintptr_t) sge_buf; + ibsge.lkey = rs->smr->lkey; + flags = 0; + } else { + ibsge.addr = (uintptr_t) &sge; + ibsge.lkey = 0; + flags = IBV_SEND_INLINE; + } ibsge.length = sizeof(sge); rs_post_write_msg(rs, &ibsge, 1, - rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size), - IBV_SEND_INLINE, - rs->remote_sgl.addr + - rs->remote_sge * sizeof(struct rs_sge), - rs->remote_sgl.key); + rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size), flags, + rs->remote_sgl.addr + rs->remote_sge * sizeof(struct rs_sge), + rs->remote_sgl.key); rs->rbuf_bytes_avail -= rs->rbuf_size >> 1; rs->rbuf_free_offset += rs->rbuf_size >> 1; @@ -1815,16 +1836,27 @@ static void 
rs_send_credits(struct rsocket *rs) } } +static inline int rs_ctrl_avail(struct rsocket *rs) +{ + return rs->ctrl_seqno != rs->ctrl_max_seqno; +} + +/* Protocols that do not support RDMA write with immediate may require 2 msgs */ +static inline int rs_2ctrl_avail(struct rsocket *rs) +{ + return (int)((rs->ctrl_seqno + 1) - rs->ctrl_max_seqno) < 0; +} + static int rs_give_credits(struct rsocket *rs) { if (!(rs->opts & RS_OPT_MSG_SEND)) { return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && - rs->ctrl_avail && (rs->state & rs_connected); + rs_ctrl_avail(rs) && (rs->state & rs_connected); } else { return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && - (rs->ctrl_avail > 1) && (rs->state & rs_connected); + rs_2ctrl_avail(rs) && (rs->state & rs_connected); } } @@ -1886,10 +1918,10 @@ static int rs_poll_cq(struct rsocket *rs) } else { switch (rs_msg_op(rs_wr_data(wc.wr_id))) { case RS_OP_SGL: - rs->ctrl_avail++; + rs->ctrl_max_seqno++; break; case RS_OP_CTRL: - rs->ctrl_avail++; + rs->ctrl_max_seqno++; if (rs_msg_data(rs_wr_data(wc.wr_id)) == RS_CTRL_DISCONNECT) rs->state = rs_disconnected; break; @@ -2228,7 +2260,7 @@ static int rs_conn_can_send(struct rsocket *rs) static int rs_conn_can_send_ctrl(struct rsocket *rs) { - return rs->ctrl_avail || !(rs->state & rs_connected); + return rs_ctrl_avail(rs) || !(rs->state & rs_connected); } static int rs_have_rdata(struct rsocket *rs) @@ -2243,7 +2275,8 @@ static int rs_conn_have_rdata(struct rsocket *rs) static int rs_conn_all_sends_done(struct rsocket *rs) { - return ((rs->sqe_avail + rs->ctrl_avail) == rs->sq_size) || + return ((((int) rs->ctrl_max_seqno) - ((int) rs->ctrl_seqno)) + + rs->sqe_avail == rs->sq_size) || !(rs->state & rs_connected); } @@ -3180,14 +3213,14 @@ int rshutdown(int socket, int how) goto out; ctrl = RS_CTRL_DISCONNECT; } - if (!rs->ctrl_avail) { + if 
(!rs_ctrl_avail(rs)) { ret = rs_process_cq(rs, 0, rs_conn_can_send_ctrl); if (ret) goto out; } - if ((rs->state & rs_connected) && rs->ctrl_avail) { - rs->ctrl_avail--; + if ((rs->state & rs_connected) && rs_ctrl_avail(rs)) { + rs->ctrl_seqno++; ret = rs_post_msg(rs, rs_msg_set(RS_OP_CTRL, ctrl)); } } @@ -3233,6 +3266,8 @@ int rclose(int socket) if (rs->type == SOCK_STREAM) { if (rs->state & rs_connected) rshutdown(socket, SHUT_RDWR); + else if (rs->opts & RS_OPT_SVC_ACTIVE) + rs_notify_svc(&tcp_svc, rs, RS_SVC_REM_KEEPALIVE); } else { ds_shutdown(rs); } @@ -3433,8 +3468,6 @@ int rsetsockopt(int socket, int level, int optname, break; case RDMA_INLINE: rs->sq_inline = min(*(uint32_t *) optval, RS_QP_MAX_SIZE); - if (rs->sq_inline < RS_MIN_INLINE) - rs->sq_inline = RS_MIN_INLINE; ret = 0; break; case RDMA_IOMAPSIZE: @@ -3443,7 +3476,7 @@ int rsetsockopt(int socket, int level, int optname, ret = 0; break; case RDMA_ROUTE: - if ((rs->optval = calloc(optlen, 1))) { + if ((rs->optval = malloc(optlen))) { memcpy(rs->optval, optval, optlen); rs->optlen = optlen; ret = 0; @@ -3469,11 +3502,38 @@ int rsetsockopt(int socket, int level, int optname, return ret; } +static void rs_convert_sa_path(struct ibv_sa_path_rec *sa_path, + struct ibv_path_data *path_data) +{ + uint32_t fl_hop; + + memset(path_data, 0, sizeof(*path_data)); + path_data->path.dgid = sa_path->dgid; + path_data->path.sgid = sa_path->sgid; + path_data->path.dlid = sa_path->dlid; + path_data->path.slid = sa_path->slid; + fl_hop = ntohl(sa_path->flow_label) << 8; + path_data->path.flowlabel_hoplimit = htonl(fl_hop | sa_path->hop_limit); /* hop limit is bits 7:0, merged before the byte swap */ + path_data->path.tclass = sa_path->traffic_class; + path_data->path.reversible_numpath = sa_path->reversible << 7 | 1; + path_data->path.pkey = sa_path->pkey; + path_data->path.qosclass_sl = htons(sa_path->sl); /* network order; ucma_convert_path reads it with ntohs() */ + path_data->path.mtu = sa_path->mtu | 2 << 6; /* exactly */ + path_data->path.rate = sa_path->rate | 2 << 6; + path_data->path.packetlifetime = sa_path->packet_life_time | 2 <<
6; + path_data->flags= sa_path->preference; +} + int rgetsockopt(int socket, int level, int optname, void *optval, socklen_t *optlen) { struct rsocket *rs; + void *opt; + struct ibv_sa_path_rec *path_rec; + struct ibv_path_data path_data; + socklen_t len; int ret = 0; + int num_paths; rs = idm_lookup(&idm, socket); if (!rs) @@ -3566,6 +3626,36 @@ int rgetsockopt(int socket, int level, int optname, *((int *) optval) = rs->target_iomap_size; *optlen = sizeof(int); break; + case RDMA_ROUTE: + if (rs->optval) { + if (*optlen < rs->optlen) { + ret = EINVAL; + } else { + memcpy(optval, rs->optval, rs->optlen); /* copy the route stored by rsetsockopt out to the caller */ + *optlen = rs->optlen; + } + } else { + if (*optlen < sizeof(path_data)) { + ret = EINVAL; + } else { + len = 0; + opt = optval; + path_rec = rs->cm_id->route.path_rec; + num_paths = 0; + while (len + sizeof(path_data) <= *optlen && + num_paths < rs->cm_id->route.num_paths) { + rs_convert_sa_path(path_rec, &path_data); + memcpy(opt, &path_data, sizeof(path_data)); + len += sizeof(path_data); + opt += sizeof(path_data); + path_rec++; + num_paths++; + } + *optlen = len; + ret = 0; + } + } + break; default: ret = ENOTSUP; break; @@ -3825,8 +3915,8 @@ static int rs_svc_grow_sets(struct rs_svc *svc, int grow_size) rss = set; contexts = set + sizeof(*rss) * svc->size; if (svc->cnt) { - memcpy(rss, svc->rss, sizeof(*rss) * svc->cnt); - memcpy(contexts, svc->contexts, svc->context_size * svc->cnt); + memcpy(rss, svc->rss, sizeof(*rss) * (svc->cnt + 1)); + memcpy(contexts, svc->contexts, svc->context_size * (svc->cnt + 1)); } free(svc->rss); @@ -3852,19 +3942,28 @@ static int rs_svc_add_rs(struct rs_svc *svc, struct rsocket *rs) return 0; } -static int rs_svc_rm_rs(struct rs_svc *svc, struct rsocket *rs) +static int rs_svc_index(struct rs_svc *svc, struct rsocket *rs) { int i; for (i = 1; i <= svc->cnt; i++) { - if (svc->rss[i] == rs) { - svc->cnt--; - svc->rss[i] = svc->rss[svc->cnt]; - memcpy(svc->contexts + i * svc->context_size, - svc->contexts + svc->cnt *
svc->context_size, - svc->context_size); - return 0; - } + if (svc->rss[i] == rs) + return i; + } + return -1; +} + +static int rs_svc_rm_rs(struct rs_svc *svc, struct rsocket *rs) +{ + int i; + + if ((i = rs_svc_index(svc, rs)) >= 0) { + svc->rss[i] = svc->rss[svc->cnt]; + memcpy(svc->contexts + i * svc->context_size, + svc->contexts + svc->cnt * svc->context_size, + svc->context_size); + svc->cnt--; + return 0; } return EBADF; } @@ -4108,6 +4207,7 @@ static uint32_t rs_get_time(void) static void tcp_svc_process_sock(struct rs_svc *svc) { struct rs_svc_msg msg; + int i; read(svc->sock[1], &msg, sizeof msg); switch (msg.cmd) { @@ -4126,8 +4226,13 @@ static void tcp_svc_process_sock(struct rs_svc *svc) msg.rs->opts &= ~RS_OPT_SVC_ACTIVE; break; case RS_SVC_MOD_KEEPALIVE: - tcp_svc_timeouts[svc->cnt] = rs_get_time() + msg.rs->keepalive_time; - msg.status = 0; + i = rs_svc_index(svc, msg.rs); + if (i >= 0) { + tcp_svc_timeouts[i] = rs_get_time() + msg.rs->keepalive_time; + msg.status = 0; + } else { + msg.status = EBADF; + } break; case RS_SVC_NOOP: msg.status = 0; @@ -4139,18 +4244,17 @@ static void tcp_svc_process_sock(struct rs_svc *svc) } /* - * Send a credit update as the keep-alive message. We may or may not have - * any credits, but if we do, then we require a minimum of 2 control credits - * for protocols that do not support RDMA write with immediate data. There's - * no need to send a keep-alive message if we have any messages outstanding, - * and we start with a minimum of 2 credits. For simplicity, we just check - * that both credits are available before sending the keep-alive. + * Send a 0 byte RDMA write with immediate as keep-alive message. + * This avoids the need for the receive side to do any acknowledgment. 
*/ static void tcp_svc_send_keepalive(struct rsocket *rs) { fastlock_acquire(&rs->cq_lock); - if ((rs->ctrl_avail > 1) && (rs->state & rs_connected)) - rs_send_credits(rs); + if (rs_ctrl_avail(rs) && (rs->state & rs_connected)) { + rs->ctrl_seqno++; + rs_post_write(rs, NULL, 0, rs_msg_set(RS_OP_CTRL, RS_CTRL_KEEPALIVE), + 0, (uint64_t) NULL, (uint64_t) NULL); + } fastlock_release(&rs->cq_lock); }