diff --git a/providers/mana/CMakeLists.txt b/providers/mana/CMakeLists.txt index 7219ee2ec..05011be02 100644 --- a/providers/mana/CMakeLists.txt +++ b/providers/mana/CMakeLists.txt @@ -4,6 +4,8 @@ rdma_shared_provider(mana libmana.map manadv.c qp.c wq.c + cq.c + wr.c ) publish_headers(infiniband diff --git a/providers/mana/cq.c b/providers/mana/cq.c new file mode 100644 index 000000000..bb9a1b0f1 --- /dev/null +++ b/providers/mana/cq.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "mana.h" +#include "gdma.h" +#include "doorbells.h" +#include "rollback.h" +#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) + +DECLARE_DRV_CMD(mana_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, + mana_ib_create_cq, mana_ib_create_cq_resp); + +struct ibv_cq *mana_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct mana_context *ctx = to_mctx(context); + struct mana_create_cq_resp resp = {}; + struct mana_ib_create_cq *cmd_drv; + struct mana_create_cq cmd = {}; + struct mana_cq *cq; + uint16_t flags = 0; + size_t cq_size; + int ret; + + cq = calloc(1, sizeof(*cq)); + if (!cq) + return NULL; + + cq_size = align_hw_size(cqe * COMP_ENTRY_SIZE); + cq->db_page = ctx->db_page; + list_head_init(&cq->send_qp_list); + list_head_init(&cq->recv_qp_list); + pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); + + cq->buf_external = ctx->extern_alloc.alloc && ctx->extern_alloc.free; + if (!cq->buf_external) + flags |= MANA_IB_CREATE_RNIC_CQ; + + if (cq->buf_external) + cq->buf = ctx->extern_alloc.alloc(cq_size, ctx->extern_alloc.data); + else + cq->buf = mana_alloc_mem(cq_size); + if (!cq->buf) { + errno = ENOMEM; + goto free_cq; + } + + if (flags & MANA_IB_CREATE_RNIC_CQ) + cq->cqe = cq_size / COMP_ENTRY_SIZE; + else + cq->cqe = cqe; // to preserve old behaviour for DPDK + cq->head = INITIALIZED_OWNER_BIT(ilog32(cq->cqe) - 1); + cq->last_armed_head = cq->head - 1; + cq->ready_wcs = 0; + + cmd_drv = &cmd.drv_payload; + cmd_drv->buf_addr = (uintptr_t)cq->buf; + cmd_drv->flags = flags; + resp.cqid = UINT32_MAX; + + ret = ibv_cmd_create_cq(context, cq->cqe, channel, comp_vector, + &cq->ibcq, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp)); + + if (ret) { + verbs_err(verbs_get_ctx(context), "Failed to Create CQ\n"); + errno = ret; + goto free_mem; + } + + if (flags & MANA_IB_CREATE_RNIC_CQ) { + cq->cqid = resp.cqid; + if (cq->cqid == UINT32_MAX) { + errno = ENODEV; + goto destroy_cq; + } + } + + return &cq->ibcq; + +destroy_cq: + ibv_cmd_destroy_cq(&cq->ibcq); +free_mem: + if (cq->buf_external) + ctx->extern_alloc.free(cq->buf, ctx->extern_alloc.data); + else + munmap(cq->buf, cq_size); +free_cq: + free(cq); + return NULL; +} + +int mana_destroy_cq(struct ibv_cq *ibcq) +{ + struct mana_cq *cq = container_of(ibcq, struct mana_cq, ibcq); + struct mana_context *ctx = to_mctx(ibcq->context); + int ret; + + pthread_spin_lock(&cq->lock); + ret = ibv_cmd_destroy_cq(ibcq); + if (ret) { + verbs_err(verbs_get_ctx(ibcq->context), + "Failed to Destroy CQ\n"); + pthread_spin_unlock(&cq->lock); + return ret; + } + pthread_spin_destroy(&cq->lock); + + if (cq->buf_external) + ctx->extern_alloc.free(cq->buf, ctx->extern_alloc.data); + else + munmap(cq->buf, cq->cqe * COMP_ENTRY_SIZE); + + free(cq); + + return ret; +} + 
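+/*
+ * Completion notification request (ibv_req_notify_cq). Solicited-only
+ * arming is not supported, and a CQ created without the RNIC flag
+ * (the DPDK path, cqid == UINT32_MAX) cannot be armed; otherwise the
+ * CQ doorbell is rung with the arm bit set.
+ */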
+int mana_arm_cq(struct ibv_cq *ibcq, int solicited) +{ + struct mana_cq *cq = container_of(ibcq, struct mana_cq, ibcq); + + if (solicited) + return -EOPNOTSUPP; + if (cq->cqid == UINT32_MAX) + return -EINVAL; + + gdma_ring_cq_doorbell(cq); + return 0; +} + +static inline uint32_t handle_rc_requester_cqe(struct mana_qp *qp, struct gdma_cqe *cqe) +{ + struct mana_gdma_queue *recv_queue = &qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER]; + struct mana_gdma_queue *send_queue = &qp->rc_qp.queues[USER_RC_SEND_QUEUE_REQUESTER]; + uint32_t syndrome = cqe->rdma_cqe.rc_armed_completion.syndrome; + uint32_t psn = cqe->rdma_cqe.rc_armed_completion.psn; + struct rc_sq_shadow_wqe *shadow_wqe; + uint32_t wcs = 0; + + if (!IB_IS_ACK(syndrome)) + return 0; + + if (!PSN_GT(psn, qp->rc_qp.sq_highest_completed_psn)) + return 0; + + qp->rc_qp.sq_highest_completed_psn = psn; + + if (!PSN_LT(psn, qp->rc_qp.sq_psn)) + return 0; + + while ((shadow_wqe = (struct rc_sq_shadow_wqe *) + shadow_queue_get_next_to_complete(&qp->shadow_sq)) != NULL) { + if (PSN_LT(psn, shadow_wqe->end_psn)) + break; + + send_queue->cons_idx += shadow_wqe->header.posted_wqe_size_in_bu; + send_queue->cons_idx &= GDMA_QUEUE_OFFSET_MASK; + + recv_queue->cons_idx += shadow_wqe->read_posted_wqe_size_in_bu; + recv_queue->cons_idx &= GDMA_QUEUE_OFFSET_MASK; + + uint32_t offset = shadow_wqe->header.unmasked_queue_offset + + shadow_wqe->header.posted_wqe_size_in_bu; + mana_ib_update_shared_mem_left_offset(qp, offset & GDMA_QUEUE_OFFSET_MASK); + + shadow_queue_advance_next_to_complete(&qp->shadow_sq); + if (shadow_wqe->header.flags != MANA_NO_SIGNAL_WC) + wcs++; + } + + uint32_t prev_psn = PSN_DEC(qp->rc_qp.sq_psn); + + if (qp->rc_qp.sq_highest_completed_psn == prev_psn) + gdma_arm_normal_cqe(recv_queue, qp->rc_qp.sq_psn); + else + gdma_arm_normal_cqe(recv_queue, prev_psn); + + return wcs; +} + +static inline uint32_t handle_rc_responder_cqe(struct mana_qp *qp, struct gdma_cqe *cqe) +{ + struct mana_gdma_queue *recv_queue = &qp->rc_qp.queues[USER_RC_RECV_QUEUE_RESPONDER]; + struct rc_rq_shadow_wqe *shadow_wqe; + + shadow_wqe = (struct rc_rq_shadow_wqe *)shadow_queue_get_next_to_complete(&qp->shadow_rq); + if (!shadow_wqe) + return 0; + + uint32_t offset_cqe = cqe->rdma_cqe.rc_recv.rx_wqe_offset / GDMA_WQE_ALIGNMENT_UNIT_SIZE; + uint32_t offset_wqe = shadow_wqe->header.unmasked_queue_offset & GDMA_QUEUE_OFFSET_MASK; + + if (offset_cqe != offset_wqe) + return 0; + + shadow_wqe->byte_len = cqe->rdma_cqe.rc_recv.msg_len; + shadow_wqe->imm_or_rkey = cqe->rdma_cqe.rc_recv.imm_data; + + switch (cqe->rdma_cqe.cqe_type) { + case CQE_TYPE_RC_WRITE_IMM: + shadow_wqe->header.opcode = IBV_WC_RECV_RDMA_WITH_IMM; + SWITCH_FALLTHROUGH; + case CQE_TYPE_RC_SEND_IMM: + shadow_wqe->header.flags |= IBV_WC_WITH_IMM; + break; + case CQE_TYPE_RC_SEND_INV: + shadow_wqe->header.flags |= IBV_WC_WITH_INV; + break; + default: + break; + } + + recv_queue->cons_idx += shadow_wqe->header.posted_wqe_size_in_bu; + recv_queue->cons_idx &= GDMA_QUEUE_OFFSET_MASK; + + shadow_queue_advance_next_to_complete(&qp->shadow_rq); + return 1; +} + +static inline uint32_t mana_handle_cqe(struct mana_context *ctx, struct gdma_cqe *cqe) +{ + struct mana_qp *qp; + + if (cqe->is_sq) // impossible for rc + return 0; + + qp = mana_get_qp_from_rq(ctx, cqe->wqid); + if (!qp) + return 0; + + if (cqe->rdma_cqe.cqe_type == CQE_TYPE_ARMED_CMPL) + return handle_rc_requester_cqe(qp, cqe); + else + return handle_rc_responder_cqe(qp, cqe); +} + +static inline int gdma_read_cqe(struct mana_cq *cq, struct 
gdma_cqe *cqe) +{ + uint32_t new_entry_owner_bits; + uint32_t old_entry_owner_bits; + struct gdma_cqe *current_cqe; + uint32_t owner_bits; + + current_cqe = ((struct gdma_cqe *)cq->buf) + (cq->head % cq->cqe); + new_entry_owner_bits = (cq->head / cq->cqe) & CQ_OWNER_MASK; + old_entry_owner_bits = (cq->head / cq->cqe - 1) & CQ_OWNER_MASK; + owner_bits = current_cqe->owner_bits; + + if (owner_bits == old_entry_owner_bits) + return 0; /* no new entry */ + if (owner_bits != new_entry_owner_bits) + return -1; /*overflow detected*/ + + udma_from_device_barrier(); + *cqe = *current_cqe; + cq->head++; + return 1; +} + +static void fill_verbs_from_shadow_wqe(struct mana_qp *qp, struct ibv_wc *wc, + const struct shadow_wqe_header *shadow_wqe) +{ + const struct rc_rq_shadow_wqe *rc_wqe = (const struct rc_rq_shadow_wqe *)shadow_wqe; + + wc->wr_id = shadow_wqe->wr_id; + wc->status = IBV_WC_SUCCESS; + wc->opcode = shadow_wqe->opcode; + wc->vendor_err = 0; + wc->wc_flags = shadow_wqe->flags; + wc->qp_num = qp->ibqp.qp.qp_num; + wc->pkey_index = 0; + + if (shadow_wqe->opcode & IBV_WC_RECV) { + wc->byte_len = rc_wqe->byte_len; + wc->imm_data = htobe32(rc_wqe->imm_or_rkey); + } +} + +static int mana_process_completions(struct mana_cq *cq, int nwc, struct ibv_wc *wc) +{ + struct shadow_wqe_header *shadow_wqe; + struct mana_qp *qp; + int wc_index = 0; + + /* process send shadow queue completions */ + list_for_each(&cq->send_qp_list, qp, send_cq_node) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_sq)) + != NULL) { + if (wc_index >= nwc && shadow_wqe->flags != MANA_NO_SIGNAL_WC) + goto out; + + if (shadow_wqe->flags != MANA_NO_SIGNAL_WC) { + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + wc_index++; + } + shadow_queue_advance_consumer(&qp->shadow_sq); + } + } + + /* process recv shadow queue completions */ + list_for_each(&cq->recv_qp_list, qp, recv_cq_node) { + while ((shadow_wqe = shadow_queue_get_next_to_consume(&qp->shadow_rq)) + != NULL) { + if (wc_index >= nwc) + goto out; + + fill_verbs_from_shadow_wqe(qp, &wc[wc_index], shadow_wqe); + wc_index++; + shadow_queue_advance_consumer(&qp->shadow_rq); + } + } + +out: + return wc_index; +} + +int mana_poll_cq(struct ibv_cq *ibcq, int nwc, struct ibv_wc *wc) +{ + struct mana_cq *cq = container_of(ibcq, struct mana_cq, ibcq); + struct mana_context *ctx = to_mctx(ibcq->context); + struct gdma_cqe gdma_cqe; + int num_polled = 0; + int ret; + + pthread_spin_lock(&cq->lock); + + while (cq->ready_wcs < nwc) { + ret = gdma_read_cqe(cq, &gdma_cqe); + if (ret < 0) { + num_polled = -1; + goto out; + } + if (ret == 0) + break; + cq->ready_wcs += mana_handle_cqe(ctx, &gdma_cqe); + } + + num_polled = mana_process_completions(cq, nwc, wc); + cq->ready_wcs -= num_polled; +out: + pthread_spin_unlock(&cq->lock); + + return num_polled; +} diff --git a/providers/mana/doorbells.h b/providers/mana/doorbells.h new file mode 100644 index 000000000..3fce6726c --- /dev/null +++ b/providers/mana/doorbells.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ + +#ifndef _DOORBELLS_H_ +#define _DOORBELLS_H_ + +#include +#include +#include "mana.h" + +#define GDMA_CQE_OWNER_BITS 3 +#define CQ_OWNER_MASK ((1 << (GDMA_CQE_OWNER_BITS)) - 1) + +#define DOORBELL_OFFSET_SQ 0x0 +#define DOORBELL_OFFSET_RQ 0x400 +#define DOORBELL_OFFSET_RQ_CLIENT 0x408 +#define DOORBELL_OFFSET_CQ 0x800 + +union gdma_doorbell_entry { + uint64_t as_uint64; + struct { + uint64_t id : 24; + uint64_t reserved : 8; + uint64_t prod_idx : 31; + uint64_t arm : 1; + } cq; + struct { + uint32_t id : 24; + uint32_t wqe_cnt : 8; + uint32_t prod_idx; + } rx; + struct { + uint32_t id : 24; + uint32_t reserved : 8; + uint32_t prod_idx; + } tx; + struct { + uint64_t id : 24; + uint64_t high : 8; + uint64_t low : 32; + } rqe_client; +}; /* HW DATA */ + +static inline void gdma_ring_recv_doorbell(struct mana_gdma_queue *wq, uint8_t wqe_cnt) +{ + union gdma_doorbell_entry e; + + e.as_uint64 = 0; + e.rx.id = wq->id; + e.rx.prod_idx = wq->prod_idx * GDMA_WQE_ALIGNMENT_UNIT_SIZE; + e.rx.wqe_cnt = wqe_cnt; + + udma_to_device_barrier(); + mmio_write64(wq->db_page + DOORBELL_OFFSET_RQ, e.as_uint64); + mmio_flush_writes(); +} + +static inline void gdma_ring_send_doorbell(struct mana_gdma_queue *wq) +{ + union gdma_doorbell_entry e; + + e.as_uint64 = 0; + e.tx.id = wq->id; + e.tx.prod_idx = wq->prod_idx * GDMA_WQE_ALIGNMENT_UNIT_SIZE; + + udma_to_device_barrier(); + mmio_write64(wq->db_page + DOORBELL_OFFSET_SQ, e.as_uint64); + mmio_flush_writes(); +} + +static inline void gdma_arm_normal_cqe(struct mana_gdma_queue *wq, uint32_t psn) +{ + union gdma_doorbell_entry e; + + e.as_uint64 = 0; + e.rqe_client.id = wq->id; + e.rqe_client.high = 1; + e.rqe_client.low = psn; + + udma_to_device_barrier(); + mmio_write64(wq->db_page + DOORBELL_OFFSET_RQ_CLIENT, e.as_uint64); + mmio_flush_writes(); +} + +static inline void gdma_ring_cq_doorbell(struct mana_cq *cq) +{ + union gdma_doorbell_entry e; + + // To address the use-case of ibv that re-arms the CQ without polling + if (cq->last_armed_head == cq->head) + cq->last_armed_head = cq->head + 1; + else + cq->last_armed_head = cq->head; + + e.as_uint64 = 0; + e.cq.id = cq->cqid; + e.cq.prod_idx = cq->last_armed_head % (cq->cqe << GDMA_CQE_OWNER_BITS); + e.cq.arm = 1; + + udma_to_device_barrier(); + mmio_write64(cq->db_page + DOORBELL_OFFSET_CQ, e.as_uint64); + mmio_flush_writes(); +} + +#endif //_DOORBELLS_H_ diff --git a/providers/mana/gdma.h b/providers/mana/gdma.h new file mode 100644 index 000000000..7c69cff92 --- /dev/null +++ b/providers/mana/gdma.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. 
+ */ +#ifndef _GDMA_H_ +#define _GDMA_H_ + +#include +#include +#include +#include +#include +#include + +#define GDMA_QUEUE_OFFSET_WIDTH 27 +#define GDMA_QUEUE_OFFSET_MASK ((1 << GDMA_QUEUE_OFFSET_WIDTH) - 1) + +#define GDMA_COMP_DATA_SIZE 60 + +#define IB_SYNDROME_ACK(credits) (0x00 + (credits)) +#define IB_SYNDROME_RNR_NAK(timer) (0x20 + (timer)) +#define IB_SYNDROME_NAK(code) (0x60 + (code)) +#define IB_IS_ACK(syndrome) (((syndrome) & 0xE0) == IB_SYNDROME_ACK(0)) + +enum gdma_work_req_flags { + GDMA_WORK_REQ_NONE = 0, + GDMA_WORK_REQ_OOB_IN_SGL = BIT(0), + GDMA_WORK_REQ_SGL_DIRECT = BIT(1), + GDMA_WORK_REQ_CONSUME_CREDIT = BIT(2), + GDMA_WORK_REQ_FENCE = BIT(3), + GDMA_WORK_REQ_CHECK_SN = BIT(4), + GDMA_WORK_REQ_PAD_DATA_BY_FIRST_SGE_SIZE = BIT(5), + GDMA_WORK_REQ_EXTRA_LARGE_OOB = BIT(5), +}; + +union gdma_oob { + struct { + uint32_t num_padding_sgls:5; + uint32_t reserved1:19; + uint32_t last_vbytes:8; + uint32_t num_sgl_entries:8; + uint32_t inline_client_oob_size:3; + uint32_t client_oob_in_sgl:1; + uint32_t consume_credit:1; + uint32_t fence:1; + uint32_t reserved2:2; + uint32_t client_data_unit:14; + uint32_t check_sn:1; + uint32_t sgl_direct:1; + } tx; + struct { + uint32_t reserved1; + uint32_t num_sgl_entries:8; + uint32_t inline_client_oob_size:3; + uint32_t reserved2:19; + uint32_t check_sn:1; + uint32_t reserved3:1; + } rx; +}; /* HW DATA */ + +/* The 16-byte struct is part of the GDMA work queue entry (WQE). */ +struct gdma_sge { + uint64_t address; + uint32_t mem_key; + uint32_t size; +}; /* HW DATA */ + +struct rdma_recv_oob { + uint32_t psn_start:24; + uint32_t reserved1:8; + uint32_t psn_range:24; + uint32_t reserved2:8; +}; /* HW DATA */ + +struct extra_large_wqe { + __le32 immediate; + uint32_t reserved; + uint64_t padding; +}; /* HW DATA */ + +struct rdma_send_oob { + uint32_t wqe_type:5; + uint32_t fence:1; + uint32_t signaled:1; + uint32_t solicited:1; + uint32_t psn:24; + + uint32_t ssn:24; // also remote_qpn + uint32_t reserved1:8; + union { + uint32_t req_details[4]; + union { + __le32 immediate; + uint32_t invalidate_key; + } send; + struct { + uint32_t address_hi; + uint32_t address_low; + uint32_t rkey; + uint32_t dma_len; + } rdma; + }; +}; /* HW DATA */ + +struct gdma_wqe { + // in units of 32-byte blocks, masked by GDMA_QUEUE_OFFSET_MASK. + uint32_t unmasked_wqe_index; + uint32_t size_in_bu; + + // Client oob is either 8 bytes or 24 bytes, so DmaOob + ClientOob will never wrap. + union gdma_oob *gdma_oob; + void *client_oob; + uint32_t client_oob_size; + + struct gdma_sge *sgl1; + uint32_t num_sge1; + // In case SGL wraps in the queue buffer. 
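+ // When it wraps, sgl2 points back at the start of the queue buffer
+ // and num_sge2 holds the entries that did not fit before the end.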
+ struct gdma_sge *sgl2; + uint32_t num_sge2; +}; + +enum wqe_opcode_types { + WQE_TYPE_UD_SEND = 0, + WQE_TYPE_UD_SEND_IMM = 1, + WQE_TYPE_RC_SEND = 2, + WQE_TYPE_RC_SEND_IMM = 3, + WQE_TYPE_RC_SEND_INV = 4, + WQE_TYPE_WRITE = 5, + WQE_TYPE_WRITE_IMM = 6, + WQE_TYPE_READ = 7, + WQE_TYPE_UD_RECV = 8, + WQE_TYPE_RC_RECV = 9, + WQE_TYPE_LOCAL_INV = 10, + WQE_TYPE_REG_MR = 11, + WQE_TYPE_MAX, +}; /* HW DATA */ + +static inline enum wqe_opcode_types + convert_wr_to_hw_opcode(enum ibv_wr_opcode opcode) +{ + switch (opcode) { + case IBV_WR_RDMA_WRITE: + return WQE_TYPE_WRITE; + case IBV_WR_RDMA_WRITE_WITH_IMM: + return WQE_TYPE_WRITE_IMM; + case IBV_WR_SEND: + return WQE_TYPE_RC_SEND; + case IBV_WR_SEND_WITH_IMM: + return WQE_TYPE_RC_SEND_IMM; + case IBV_WR_RDMA_READ: + return WQE_TYPE_READ; + default: + return WQE_TYPE_MAX; + } +} + +enum { + CQE_TYPE_NOP = 0, + CQE_TYPE_UD_SEND = 1, + CQE_TYPE_UD_SEND_IMM = 2, + CQE_TYPE_RC_SEND = 3, + CQE_TYPE_RC_SEND_IMM = 4, + CQE_TYPE_RC_SEND_INV = 5, + CQE_TYPE_RC_WRITE_IMM = 6, + CQE_TYPE_ARMED_CMPL = 7, + CQE_TYPE_LWR = 8, + CQE_TYPE_RC_FENCE = 9, + CQE_TYPE_MAX +}; /* HW DATA */ + +struct mana_rdma_cqe { + uint32_t cqe_type : 8; + uint32_t vendor_error : 8; + uint32_t reserved1 : 16; + union { + uint32_t data[GDMA_COMP_DATA_SIZE / sizeof(uint32_t) - 4]; + struct { + uint32_t msg_len; + uint32_t psn : 24; + uint32_t reserved : 8; + uint32_t imm_data; + uint32_t rx_wqe_offset; + } rc_recv; + struct { + uint32_t sge_offset : 5; + uint32_t rx_wqe_offset : 27; + uint32_t sge_byte_offset; + } ud_send; + struct { + uint32_t msg_len; + uint32_t src_qpn : 24; + uint32_t reserved : 8; + uint32_t imm_data; + uint32_t rx_wqe_offset; + } ud_recv; + + struct { + uint32_t reserved1; + uint32_t psn : 24; + uint32_t reserved2 : 8; + uint32_t imm_data; + uint32_t rx_wqe_offset; + } rc_write_with_imm; + struct { + uint32_t msn : 24; + uint32_t syndrome : 8; + uint32_t psn : 24; + uint32_t reserved : 8; + uint32_t read_resp_psn : 24; + } rc_armed_completion; + }; + uint32_t timestamp_hi; + uint32_t timestamp_lo; + uint32_t reserved3; +}; /* HW DATA */ + +struct gdma_cqe { + union { + uint8_t data[GDMA_COMP_DATA_SIZE]; + struct mana_rdma_cqe rdma_cqe; + }; + uint32_t wqid : 24; + uint32_t is_sq : 1; + uint32_t reserved : 4; + uint32_t owner_bits : 3; +}; /* HW DATA */ + +#endif //_GDMA_H_ diff --git a/providers/mana/mana.c b/providers/mana/mana.c index 90aabd9d5..efd5379f7 100644 --- a/providers/mana/mana.c +++ b/providers/mana/mana.c @@ -25,9 +25,6 @@ DECLARE_DRV_CMD(mana_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, empty, DECLARE_DRV_CMD(mana_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, empty, empty); -DECLARE_DRV_CMD(mana_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, mana_ib_create_cq, - empty); - static const struct verbs_match_ent hca_table[] = { VERBS_DRIVER_ID(RDMA_DRIVER_MANA), {}, @@ -38,6 +35,18 @@ struct mana_context *to_mctx(struct ibv_context *ibctx) return container_of(ibctx, struct mana_context, ibv_ctx.context); } +void *mana_alloc_mem(uint32_t size) +{ + void *buf; + + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (buf == MAP_FAILED) + return NULL; + return buf; +} + int mana_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size) @@ -189,122 +198,16 @@ int mana_dereg_mr(struct verbs_mr *vmr) return 0; } -struct ibv_cq *mana_create_cq(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, int comp_vector) -{ - struct 
mana_context *ctx = to_mctx(context); - struct mana_cq *cq; - struct mana_create_cq cmd = {}; - struct mana_create_cq_resp resp = {}; - struct mana_ib_create_cq *cmd_drv; - int cq_size; - int ret; - - if (!ctx->extern_alloc.alloc || !ctx->extern_alloc.free) { - /* - * This version of driver doesn't support allocating buffers - * in rdma-core. - */ - verbs_err(verbs_get_ctx(context), - "Allocating core buffers for CQ is not supported\n"); - errno = EINVAL; - return NULL; - } - - cq = calloc(1, sizeof(*cq)); - if (!cq) - return NULL; - - cq_size = cqe * COMP_ENTRY_SIZE; - cq_size = roundup_pow_of_two(cq_size); - cq_size = align(cq_size, MANA_PAGE_SIZE); - - cq->buf = ctx->extern_alloc.alloc(cq_size, ctx->extern_alloc.data); - if (!cq->buf) { - errno = ENOMEM; - goto free_cq; - } - cq->cqe = cqe; - - cmd_drv = &cmd.drv_payload; - cmd_drv->buf_addr = (uintptr_t)cq->buf; - - ret = ibv_cmd_create_cq(context, cq->cqe, channel, comp_vector, - &cq->ibcq, &cmd.ibv_cmd, sizeof(cmd), - &resp.ibv_resp, sizeof(resp)); - - if (ret) { - verbs_err(verbs_get_ctx(context), "Failed to Create CQ\n"); - ctx->extern_alloc.free(cq->buf, ctx->extern_alloc.data); - errno = ret; - goto free_cq; - } - - return &cq->ibcq; - -free_cq: - free(cq); - return NULL; -} - -int mana_destroy_cq(struct ibv_cq *ibcq) -{ - int ret; - struct mana_cq *cq = container_of(ibcq, struct mana_cq, ibcq); - struct mana_context *ctx = to_mctx(ibcq->context); - - if (!ctx->extern_alloc.free) { - /* - * This version of driver doesn't support allocating buffers - * in rdma-core. It's not possible to reach the code here. - */ - verbs_err(verbs_get_ctx(ibcq->context), - "Invalid external context in destroy CQ\n"); - return -EINVAL; - } - - ret = ibv_cmd_destroy_cq(ibcq); - if (ret) { - verbs_err(verbs_get_ctx(ibcq->context), - "Failed to Destroy CQ\n"); - return ret; - } - - ctx->extern_alloc.free(cq->buf, ctx->extern_alloc.data); - free(cq); - - return ret; -} - -static int mana_poll_cq(struct ibv_cq *ibcq, int nwc, struct ibv_wc *wc) -{ - /* This version of driver supports RAW QP only. - * Polling CQ is done directly in the application. - */ - return EOPNOTSUPP; -} - -static int mana_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, - struct ibv_recv_wr **bad) -{ - /* This version of driver supports RAW QP only. - * Posting WR is done directly in the application. - */ - return EOPNOTSUPP; -} - -static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad) -{ - /* This version of driver supports RAW QP only. - * Posting WR is done directly in the application. 
- */ - return EOPNOTSUPP; -} - static void mana_free_context(struct ibv_context *ibctx) { struct mana_context *context = to_mctx(ibctx); + int i; + + for (i = 0; i < MANA_QP_TABLE_SIZE; ++i) { + if (context->qp_table[i].refcnt) + free(context->qp_table[i].table); + } + pthread_mutex_destroy(&context->qp_table_mutex); munmap(context->db_page, DOORBELL_PAGE_SIZE); verbs_uninit_context(&context->ibv_ctx); @@ -334,6 +237,7 @@ static const struct verbs_context_ops mana_ctx_ops = { .query_device_ex = mana_query_device_ex, .query_port = mana_query_port, .reg_mr = mana_reg_mr, + .req_notify_cq = mana_arm_cq, }; static struct verbs_device *mana_device_alloc(struct verbs_sysfs_dev *sysfs_dev) @@ -358,7 +262,7 @@ static void mana_uninit_device(struct verbs_device *verbs_device) static struct verbs_context *mana_alloc_context(struct ibv_device *ibdev, int cmd_fd, void *private_data) { - int ret; + int ret, i; struct mana_context *context; struct mana_alloc_ucontext_resp resp; struct ibv_get_context cmd; @@ -378,6 +282,10 @@ static struct verbs_context *mana_alloc_context(struct ibv_device *ibdev, verbs_set_ops(&context->ibv_ctx, &mana_ctx_ops); + pthread_mutex_init(&context->qp_table_mutex, NULL); + for (i = 0; i < MANA_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + context->db_page = mmap(NULL, DOORBELL_PAGE_SIZE, PROT_WRITE, MAP_SHARED, context->ibv_ctx.context.cmd_fd, 0); if (context->db_page == MAP_FAILED) { diff --git a/providers/mana/mana.h b/providers/mana/mana.h index 50d747314..c0ac97bfa 100644 --- a/providers/mana/mana.h +++ b/providers/mana/mana.h @@ -7,6 +7,8 @@ #define _MANA_H_ #include "manadv.h" +#include +#include "shadow_queue.h" #define COMP_ENTRY_SIZE 64 #define MANA_IB_TOEPLITZ_HASH_KEY_SIZE_IN_BYTES 40 @@ -24,31 +26,59 @@ #define DOORBELL_PAGE_SIZE 4096 #define MANA_PAGE_SIZE 4096 -static inline int align_next_power2(int size) -{ - int val = 1; - - while (val < size) - val <<= 1; +#define MANA_QP_TABLE_SIZE 4096 +#define MANA_QP_TABLE_SHIFT 12 +#define MANA_QP_TABLE_MASK (MANA_QP_TABLE_SIZE - 1) + +/* PSN 24 bit arithmetic comparisons */ +#define PSN_MASK 0xFFFFFF +#define PSN_SIGN_BIT 0x800000 +#define PSN_GE(PSN1, PSN2) ((((PSN1) - (PSN2)) & PSN_SIGN_BIT) == 0) +#define PSN_GT(PSN1, PSN2) PSN_GE(PSN1, (PSN2) + 1) +#define PSN_LE(PSN1, PSN2) PSN_GE(PSN2, PSN1) +#define PSN_LT(PSN1, PSN2) PSN_GT(PSN2, PSN1) +#define MTU_SIZE(MTU) (1U << ((MTU) + 7)) +#define PSN_DELTA(MSG_SIZE, MTU) max(1U, ((MSG_SIZE) + MTU_SIZE(MTU) - 1) >> (MTU + 7)) +#define PSN_DEC(PSN) (((PSN) - 1) & PSN_MASK) +#define PSN_INC(PSN) (((PSN) + 1) & PSN_MASK) +#define PSN_ADD(PSN, DELTA) (((PSN) + (DELTA)) & PSN_MASK) + +enum user_queue_types { + USER_RC_SEND_QUEUE_REQUESTER = 0, + USER_RC_SEND_QUEUE_RESPONDER = 1, + USER_RC_RECV_QUEUE_REQUESTER = 2, + USER_RC_RECV_QUEUE_RESPONDER = 3, + USER_RC_QUEUE_TYPE_MAX = 4, +}; - return val; +static inline uint32_t align_hw_size(uint32_t size) +{ + size = roundup_pow_of_two(size); + return align(size, MANA_PAGE_SIZE); } -static inline int align_hw_size(int size) +static inline uint32_t get_wqe_size(uint32_t sge) { - size = align(size, MANA_PAGE_SIZE); - return align_next_power2(size); + uint32_t wqe_size = sge * SGE_SIZE + DMA_OOB_SIZE + INLINE_OOB_SMALL_SIZE; + + return align(wqe_size, GDMA_WQE_ALIGNMENT_UNIT_SIZE); } -static inline int get_wqe_size(int sge) +static inline uint32_t get_large_wqe_size(uint32_t sge) { - int wqe_size = sge * SGE_SIZE + DMA_OOB_SIZE + INLINE_OOB_SMALL_SIZE; + uint32_t wqe_size = sge * SGE_SIZE + DMA_OOB_SIZE + 
INLINE_OOB_LARGE_SIZE; return align(wqe_size, GDMA_WQE_ALIGNMENT_UNIT_SIZE); } struct mana_context { struct verbs_context ibv_ctx; + struct { + struct mana_qp **table; + int refcnt; + } qp_table[MANA_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + struct manadv_ctx_allocators extern_alloc; void *db_page; }; @@ -60,18 +90,51 @@ struct mana_rwq_ind_table { struct ibv_wq **ind_tbl; }; -struct mana_qp { - struct verbs_qp ibqp; +struct mana_gdma_queue { + uint32_t id; + uint32_t size; + uint32_t prod_idx; + uint32_t cons_idx; + void *db_page; + void *buffer; +}; + +struct mana_ib_raw_qp { void *send_buf; uint32_t send_buf_size; - int send_wqe_count; - uint32_t sqid; uint32_t tx_vp_offset; }; +struct mana_ib_rc_qp { + struct mana_gdma_queue queues[USER_RC_QUEUE_TYPE_MAX]; + + uint32_t sq_ssn; + uint32_t sq_psn; + uint32_t sq_highest_completed_psn; +}; + +struct mana_qp { + struct verbs_qp ibqp; + pthread_spinlock_t sq_lock; + pthread_spinlock_t rq_lock; + + union { + struct mana_ib_raw_qp raw_qp; + struct mana_ib_rc_qp rc_qp; + }; + + enum ibv_mtu mtu; + + struct shadow_queue shadow_rq; + struct shadow_queue shadow_sq; + + struct list_node send_cq_node; + struct list_node recv_cq_node; +}; + struct mana_wq { struct ibv_wq ibwq; @@ -87,9 +150,19 @@ struct mana_wq { struct mana_cq { struct ibv_cq ibcq; uint32_t cqe; + uint32_t cqid; void *buf; - uint32_t cqid; + pthread_spinlock_t lock; + uint32_t head; + uint32_t last_armed_head; + uint32_t ready_wcs; + void *db_page; + /* list of qp's that use this cq for send completions */ + struct list_head send_qp_list; + /* list of qp's that use this cq for recv completions */ + struct list_head recv_qp_list; + bool buf_external; }; struct mana_device { @@ -108,6 +181,8 @@ struct mana_parent_domain { struct mana_context *to_mctx(struct ibv_context *ibctx); +void *mana_alloc_mem(uint32_t size); + int mana_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size); @@ -133,6 +208,8 @@ struct ibv_cq *mana_create_cq(struct ibv_context *context, int cqe, int mana_destroy_cq(struct ibv_cq *cq); +int mana_poll_cq(struct ibv_cq *ibcq, int nwc, struct ibv_wc *wc); + struct ibv_wq *mana_create_wq(struct ibv_context *context, struct ibv_wq_init_attr *attr); @@ -154,4 +231,13 @@ int mana_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); int mana_destroy_qp(struct ibv_qp *ibqp); +int mana_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad); + +int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad); + +int mana_arm_cq(struct ibv_cq *ibcq, int solicited); + +struct mana_qp *mana_get_qp_from_rq(struct mana_context *ctx, uint32_t qpn); #endif diff --git a/providers/mana/manadv.c b/providers/mana/manadv.c index 3fcd52335..4b40d05bb 100644 --- a/providers/mana/manadv.c +++ b/providers/mana/manadv.c @@ -53,11 +53,11 @@ int manadv_init_obj(struct manadv_obj *obj, uint64_t obj_type) struct ibv_context *context = ibqp->context; struct mana_context *ctx = to_mctx(context); - obj->qp.out->sq_buf = qp->send_buf; - obj->qp.out->sq_count = qp->send_wqe_count; - obj->qp.out->sq_size = qp->send_buf_size; - obj->qp.out->sq_id = qp->sqid; - obj->qp.out->tx_vp_offset = qp->tx_vp_offset; + obj->qp.out->sq_buf = qp->raw_qp.send_buf; + obj->qp.out->sq_count = qp->raw_qp.send_wqe_count; + obj->qp.out->sq_size = qp->raw_qp.send_buf_size; + obj->qp.out->sq_id = qp->raw_qp.sqid; + obj->qp.out->tx_vp_offset = 
qp->raw_qp.tx_vp_offset; obj->qp.out->db_page = ctx->db_page; } diff --git a/providers/mana/qp.c b/providers/mana/qp.c index 328ce419d..596ab75af 100644 --- a/providers/mana/qp.c +++ b/providers/mana/qp.c @@ -19,6 +19,8 @@ #include #include "mana.h" +#include "rollback.h" +#include "doorbells.h" DECLARE_DRV_CMD(mana_create_qp, IB_USER_VERBS_CMD_CREATE_QP, mana_ib_create_qp, mana_ib_create_qp_resp); @@ -26,6 +28,9 @@ DECLARE_DRV_CMD(mana_create_qp, IB_USER_VERBS_CMD_CREATE_QP, mana_ib_create_qp, DECLARE_DRV_CMD(mana_create_qp_ex, IB_USER_VERBS_EX_CMD_CREATE_QP, mana_ib_create_qp_rss, mana_ib_create_qp_rss_resp); +DECLARE_DRV_CMD(mana_create_rc_qp, IB_USER_VERBS_CMD_CREATE_QP, + mana_ib_create_rc_qp, mana_ib_create_rc_qp_resp); + static struct ibv_qp *mana_create_qp_raw(struct ibv_pd *ibpd, struct ibv_qp_init_attr *attr) { @@ -67,13 +72,13 @@ static struct ibv_qp *mana_create_qp_raw(struct ibv_pd *ibpd, if (!qp) return NULL; - qp->send_buf_size = + qp->raw_qp.send_buf_size = attr->cap.max_send_wr * get_wqe_size(attr->cap.max_send_sge); - qp->send_buf_size = align_hw_size(qp->send_buf_size); + qp->raw_qp.send_buf_size = align_hw_size(qp->raw_qp.send_buf_size); - qp->send_buf = ctx->extern_alloc.alloc(qp->send_buf_size, - ctx->extern_alloc.data); - if (!qp->send_buf) { + qp->raw_qp.send_buf = ctx->extern_alloc.alloc(qp->raw_qp.send_buf_size, + ctx->extern_alloc.data); + if (!qp->raw_qp.send_buf) { errno = ENOMEM; goto free_qp; } @@ -81,8 +86,8 @@ static struct ibv_qp *mana_create_qp_raw(struct ibv_pd *ibpd, qp_cmd_drv = &qp_cmd.drv_payload; qp_resp_drv = &qp_resp.drv_payload; - qp_cmd_drv->sq_buf_addr = (uintptr_t)qp->send_buf; - qp_cmd_drv->sq_buf_size = qp->send_buf_size; + qp_cmd_drv->sq_buf_addr = (uintptr_t)qp->raw_qp.send_buf; + qp_cmd_drv->sq_buf_size = qp->raw_qp.send_buf_size; qp_cmd_drv->port = port; ret = ibv_cmd_create_qp(ibpd, &qp->ibqp.qp, attr, &qp_cmd.ibv_cmd, @@ -90,14 +95,14 @@ static struct ibv_qp *mana_create_qp_raw(struct ibv_pd *ibpd, sizeof(qp_resp)); if (ret) { verbs_err(verbs_get_ctx(ibpd->context), "Create QP failed\n"); - ctx->extern_alloc.free(qp->send_buf, ctx->extern_alloc.data); + ctx->extern_alloc.free(qp->raw_qp.send_buf, ctx->extern_alloc.data); errno = ret; goto free_qp; } - qp->sqid = qp_resp_drv->sqid; - qp->tx_vp_offset = qp_resp_drv->tx_vp_offset; - qp->send_wqe_count = attr->cap.max_send_wr; + qp->raw_qp.sqid = qp_resp_drv->sqid; + qp->raw_qp.tx_vp_offset = qp_resp_drv->tx_vp_offset; + qp->raw_qp.send_wqe_count = attr->cap.max_send_wr; cq->cqid = qp_resp_drv->cqid; @@ -108,12 +113,217 @@ static struct ibv_qp *mana_create_qp_raw(struct ibv_pd *ibpd, return NULL; } +static int mana_store_qp(struct mana_context *ctx, struct mana_qp *qp, uint32_t qid) +{ + uint32_t tbl_idx, tbl_off; + int ret = 0; + + pthread_mutex_lock(&ctx->qp_table_mutex); + + tbl_idx = qid >> MANA_QP_TABLE_SHIFT; + tbl_off = qid & MANA_QP_TABLE_MASK; + + if (ctx->qp_table[tbl_idx].refcnt == 0) { + ctx->qp_table[tbl_idx].table = + calloc(MANA_QP_TABLE_SIZE, sizeof(struct mana_qp *)); + if (!ctx->qp_table[tbl_idx].table) { + ret = ENOMEM; + goto out; + } + } + + if (ctx->qp_table[tbl_idx].table[tbl_off]) { + ret = EBUSY; + goto out; + } + + ctx->qp_table[tbl_idx].table[tbl_off] = qp; + ctx->qp_table[tbl_idx].refcnt++; + +out: + pthread_mutex_unlock(&ctx->qp_table_mutex); + return ret; +} + +static void mana_remove_qp(struct mana_context *ctx, uint32_t qid) +{ + uint32_t tbl_idx, tbl_off; + + pthread_mutex_lock(&ctx->qp_table_mutex); + tbl_idx = qid >> MANA_QP_TABLE_SHIFT; + tbl_off = qid & 
MANA_QP_TABLE_MASK; + + ctx->qp_table[tbl_idx].table[tbl_off] = NULL; + ctx->qp_table[tbl_idx].refcnt--; + + if (ctx->qp_table[tbl_idx].refcnt == 0) { + free(ctx->qp_table[tbl_idx].table); + ctx->qp_table[tbl_idx].table = NULL; + } + + pthread_mutex_unlock(&ctx->qp_table_mutex); +} + +struct mana_qp *mana_get_qp_from_rq(struct mana_context *ctx, uint32_t qid) +{ + uint32_t tbl_idx, tbl_off; + + tbl_idx = qid >> MANA_QP_TABLE_SHIFT; + tbl_off = qid & MANA_QP_TABLE_MASK; + + if (!ctx->qp_table[tbl_idx].table) + return NULL; + + return ctx->qp_table[tbl_idx].table[tbl_off]; +} + +static uint32_t get_queue_size(struct ibv_qp_init_attr *attr, enum user_queue_types type) +{ + uint32_t size = 0; + + if (attr->qp_type == IBV_QPT_RC) { + switch (type) { + case USER_RC_SEND_QUEUE_REQUESTER: + /* For write with imm we need +1 */ + size = attr->cap.max_send_wr * get_large_wqe_size(attr->cap.max_send_sge + 1); + break; + case USER_RC_SEND_QUEUE_RESPONDER: + size = MANA_PAGE_SIZE; + break; + case USER_RC_RECV_QUEUE_REQUESTER: + size = MANA_PAGE_SIZE; + break; + case USER_RC_RECV_QUEUE_RESPONDER: + size = attr->cap.max_recv_wr * get_wqe_size(attr->cap.max_recv_sge); + break; + default: + return 0; + } + } + + size = align_hw_size(size); + + if (attr->qp_type == IBV_QPT_RC && type == USER_RC_SEND_QUEUE_REQUESTER) + size += sizeof(struct mana_ib_rollback_shared_mem); + + return size; +} + +static struct ibv_qp *mana_create_qp_rc(struct ibv_pd *ibpd, + struct ibv_qp_init_attr *attr) +{ + struct mana_cq *send_cq = container_of(attr->send_cq, struct mana_cq, ibcq); + struct mana_cq *recv_cq = container_of(attr->recv_cq, struct mana_cq, ibcq); + struct mana_context *ctx = to_mctx(ibpd->context); + struct mana_ib_create_rc_qp_resp *qp_resp_drv; + struct mana_create_rc_qp_resp qp_resp = {}; + struct mana_ib_create_rc_qp *qp_cmd_drv; + struct mana_create_rc_qp qp_cmd = {}; + struct mana_qp *qp; + int ret, i; + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + qp_cmd_drv = &qp_cmd.drv_payload; + qp_resp_drv = &qp_resp.drv_payload; + + pthread_spin_init(&qp->sq_lock, PTHREAD_PROCESS_PRIVATE); + pthread_spin_init(&qp->rq_lock, PTHREAD_PROCESS_PRIVATE); + + if (create_shadow_queue(&qp->shadow_sq, attr->cap.max_send_wr, + sizeof(struct rc_sq_shadow_wqe))) { + verbs_err(verbs_get_ctx(ibpd->context), "Failed to alloc sq shadow queue\n"); + errno = ENOMEM; + goto free_qp; + } + + if (create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr, + sizeof(struct rc_rq_shadow_wqe))) { + verbs_err(verbs_get_ctx(ibpd->context), "Failed to alloc rc shadow queue\n"); + errno = ENOMEM; + goto destroy_shadow_sq; + } + + for (i = 0; i < USER_RC_QUEUE_TYPE_MAX; ++i) { + qp->rc_qp.queues[i].db_page = ctx->db_page; + qp->rc_qp.queues[i].size = get_queue_size(attr, i); + qp->rc_qp.queues[i].buffer = mana_alloc_mem(qp->rc_qp.queues[i].size); + + if (!qp->rc_qp.queues[i].buffer) { + verbs_err(verbs_get_ctx(ibpd->context), + "Failed to allocate memory for RC queue %d\n", i); + errno = ENOMEM; + goto destroy_queues; + } + + qp_cmd_drv->queue_buf[i] = (uintptr_t)qp->rc_qp.queues[i].buffer; + qp_cmd_drv->queue_size[i] = qp->rc_qp.queues[i].size; + } + + mana_ib_init_rb_shmem(qp); + + ret = ibv_cmd_create_qp(ibpd, &qp->ibqp.qp, attr, &qp_cmd.ibv_cmd, + sizeof(qp_cmd), &qp_resp.ibv_resp, + sizeof(qp_resp)); + if (ret) { + verbs_err(verbs_get_ctx(ibpd->context), "Create QP failed\n"); + errno = ret; + goto free_rb; + } + + for (i = 0; i < USER_RC_QUEUE_TYPE_MAX; ++i) + qp->rc_qp.queues[i].id = qp_resp_drv->queue_id[i]; + + 
qp->ibqp.qp.qp_num = qp->rc_qp.queues[USER_RC_RECV_QUEUE_RESPONDER].id; + + ret = mana_store_qp(ctx, qp, qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER].id); + if (ret) { + errno = ret; + goto destroy_qp; + } + ret = mana_store_qp(ctx, qp, qp->rc_qp.queues[USER_RC_RECV_QUEUE_RESPONDER].id); + if (ret) { + errno = ret; + goto remove_qp_req; + } + + pthread_spin_lock(&send_cq->lock); + list_add(&send_cq->send_qp_list, &qp->send_cq_node); + pthread_spin_unlock(&send_cq->lock); + + pthread_spin_lock(&recv_cq->lock); + list_add(&recv_cq->recv_qp_list, &qp->recv_cq_node); + pthread_spin_unlock(&recv_cq->lock); + + return &qp->ibqp.qp; + +remove_qp_req: + mana_remove_qp(ctx, qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER].id); +destroy_qp: + ibv_cmd_destroy_qp(&qp->ibqp.qp); +free_rb: + mana_ib_deinit_rb_shmem(qp); +destroy_queues: + while (i-- > 0) + munmap(qp->rc_qp.queues[i].buffer, qp->rc_qp.queues[i].size); + destroy_shadow_queue(&qp->shadow_rq); +destroy_shadow_sq: + destroy_shadow_queue(&qp->shadow_sq); +free_qp: + free(qp); + return NULL; +} + struct ibv_qp *mana_create_qp(struct ibv_pd *ibpd, struct ibv_qp_init_attr *attr) { switch (attr->qp_type) { case IBV_QPT_RAW_PACKET: return mana_create_qp_raw(ibpd, attr); + case IBV_QPT_RC: + return mana_create_qp_rc(ibpd, attr); default: verbs_err(verbs_get_ctx(ibpd->context), "QP type %u is not supported\n", attr->qp_type); @@ -123,25 +333,92 @@ struct ibv_qp *mana_create_qp(struct ibv_pd *ibpd, return NULL; } -int mana_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) +static void mana_ib_modify_rc_qp(struct mana_qp *qp, struct ibv_qp_attr *attr, int attr_mask) +{ + int i; + + if (attr_mask & IBV_QP_PATH_MTU) + qp->mtu = attr->path_mtu; + + switch (attr->qp_state) { + case IBV_QPS_RESET: + case IBV_QPS_INIT: + for (i = 0; i < USER_RC_QUEUE_TYPE_MAX; ++i) { + qp->rc_qp.queues[i].prod_idx = 0; + qp->rc_qp.queues[i].cons_idx = 0; + } + mana_ib_reset_rb_shmem(qp); + reset_shadow_queue(&qp->shadow_rq); + break; + case IBV_QPS_RTR: + break; + case IBV_QPS_RTS: + reset_shadow_queue(&qp->shadow_sq); + qp->rc_qp.sq_ssn = 1; + qp->rc_qp.sq_psn = attr->sq_psn; + qp->rc_qp.sq_highest_completed_psn = PSN_DEC(attr->sq_psn); + gdma_arm_normal_cqe(&qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER], attr->sq_psn); + break; + default: + break; + } +} + +int mana_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask) { - return EOPNOTSUPP; + struct mana_qp *qp = container_of(ibqp, struct mana_qp, ibqp.qp); + struct ibv_modify_qp cmd = {}; + int err; + + if (ibqp->qp_type != IBV_QPT_RC) + return EOPNOTSUPP; + + if (!(attr_mask & IBV_QP_STATE)) + return 0; + + err = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); + if (err) { + verbs_err(verbs_get_ctx(ibqp->context), "Failed to modify qp\n"); + return err; + } + + mana_ib_modify_rc_qp(qp, attr, attr_mask); + + return 0; +} + +static void mana_drain_cqes(struct mana_qp *qp) +{ + struct mana_cq *send_cq = container_of(qp->ibqp.qp.send_cq, struct mana_cq, ibcq); + struct mana_cq *recv_cq = container_of(qp->ibqp.qp.recv_cq, struct mana_cq, ibcq); + + pthread_spin_lock(&send_cq->lock); + while (shadow_queue_get_next_to_consume(&qp->shadow_sq)) { + shadow_queue_advance_consumer(&qp->shadow_sq); + send_cq->ready_wcs--; + } + list_del(&qp->send_cq_node); + pthread_spin_unlock(&send_cq->lock); + + pthread_spin_lock(&recv_cq->lock); + while (shadow_queue_get_next_to_consume(&qp->shadow_rq)) { + shadow_queue_advance_consumer(&qp->shadow_rq); + recv_cq->ready_wcs--; + } + 
list_del(&qp->recv_cq_node); + pthread_spin_unlock(&recv_cq->lock); } int mana_destroy_qp(struct ibv_qp *ibqp) { - int ret; struct mana_qp *qp = container_of(ibqp, struct mana_qp, ibqp.qp); struct mana_context *ctx = to_mctx(ibqp->context); + int ret, i; - if (!ctx->extern_alloc.free) { - /* - * This version of driver doesn't support allocating buffers - * in rdma-core. - */ - verbs_err(verbs_get_ctx(ibqp->context), - "Invalid context in Destroy QP\n"); - return -EINVAL; + if (ibqp->qp_type == IBV_QPT_RC) { + mana_remove_qp(ctx, qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER].id); + mana_remove_qp(ctx, qp->rc_qp.queues[USER_RC_RECV_QUEUE_RESPONDER].id); + mana_drain_cqes(qp); } ret = ibv_cmd_destroy_qp(ibqp); @@ -150,7 +427,24 @@ int mana_destroy_qp(struct ibv_qp *ibqp) return ret; } - ctx->extern_alloc.free(qp->send_buf, ctx->extern_alloc.data); + switch (ibqp->qp_type) { + case IBV_QPT_RAW_PACKET: + ctx->extern_alloc.free(qp->raw_qp.send_buf, ctx->extern_alloc.data); + break; + case IBV_QPT_RC: + pthread_spin_destroy(&qp->sq_lock); + pthread_spin_destroy(&qp->rq_lock); + destroy_shadow_queue(&qp->shadow_sq); + destroy_shadow_queue(&qp->shadow_rq); + mana_ib_deinit_rb_shmem(qp); + for (i = 0; i < USER_RC_QUEUE_TYPE_MAX; ++i) + munmap(qp->rc_qp.queues[i].buffer, qp->rc_qp.queues[i].size); + break; + default: + verbs_err(verbs_get_ctx(ibqp->context), + "QP type %u is not supported\n", ibqp->qp_type); + errno = EINVAL; + } free(qp); return 0; diff --git a/providers/mana/rollback.h b/providers/mana/rollback.h new file mode 100644 index 000000000..e91b7eeff --- /dev/null +++ b/providers/mana/rollback.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#ifndef _ROLLBACK_H_ +#define _ROLLBACK_H_ + +#include +#include +#include +#include +#include +#include "mana.h" + +#define MAKE_TAG(a, b, c, d) (((uint32_t)(d) << 24) | ((c) << 16) | ((b) << 8) | (a)) +#define RNIC_ROLLBACK_SHARED_MEM_SIG MAKE_TAG('R', 'L', 'B', 'K') + +struct mana_ib_rollback_shared_mem { + uint32_t signature; + uint32_t size; + + _Atomic(uint32_t) left_offset; + _Atomic(uint32_t) right_offset; +}; + +static inline struct mana_ib_rollback_shared_mem + *mana_ib_get_rollback_sh_mem(struct mana_qp *qp) +{ + struct mana_ib_rollback_shared_mem *rb_shmem; + struct mana_gdma_queue *req_sq = + &qp->rc_qp.queues[USER_RC_SEND_QUEUE_REQUESTER]; + + rb_shmem = (struct mana_ib_rollback_shared_mem *) + ((uint8_t *)req_sq->buffer + req_sq->size); + + return rb_shmem; +} + +static inline void mana_ib_init_rb_shmem(struct mana_qp *qp) +{ + // take some bytes for rollback memory + struct mana_gdma_queue *req_sq = + &qp->rc_qp.queues[USER_RC_SEND_QUEUE_REQUESTER]; + req_sq->size -= sizeof(struct mana_ib_rollback_shared_mem); + + struct mana_ib_rollback_shared_mem *rb_shmem = + mana_ib_get_rollback_sh_mem(qp); + + memset(rb_shmem, 0, sizeof(*rb_shmem)); + rb_shmem->signature = RNIC_ROLLBACK_SHARED_MEM_SIG; + rb_shmem->size = sizeof(struct mana_ib_rollback_shared_mem); +} + +static inline void mana_ib_deinit_rb_shmem(struct mana_qp *qp) +{ + // return back bytes for rollback memory + struct mana_gdma_queue *req_sq = + &qp->rc_qp.queues[USER_RC_SEND_QUEUE_REQUESTER]; + req_sq->size += sizeof(struct mana_ib_rollback_shared_mem); +} + +static inline void mana_ib_reset_rb_shmem(struct mana_qp *qp) +{ + struct mana_ib_rollback_shared_mem *rb_shmem = + mana_ib_get_rollback_sh_mem(qp); + + atomic_store(&rb_shmem->right_offset, 0); + 
atomic_store(&rb_shmem->left_offset, 0); +} + +static inline void mana_ib_update_shared_mem_right_offset(struct mana_qp *qp, uint32_t offset_in_bu) +{ + struct mana_ib_rollback_shared_mem *rb_shmem = + mana_ib_get_rollback_sh_mem(qp); + + atomic_store(&rb_shmem->right_offset, offset_in_bu); +} + +static inline void mana_ib_update_shared_mem_left_offset(struct mana_qp *qp, uint32_t offset_in_bu) +{ + struct mana_ib_rollback_shared_mem *rb_shmem = + mana_ib_get_rollback_sh_mem(qp); + + atomic_store(&rb_shmem->left_offset, offset_in_bu); +} + +#endif //_ROLLBACK_H_ diff --git a/providers/mana/shadow_queue.h b/providers/mana/shadow_queue.h new file mode 100644 index 000000000..6c86cdbfa --- /dev/null +++ b/providers/mana/shadow_queue.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#ifndef _SHADOW_QUEUE_H_ +#define _SHADOW_QUEUE_H_ + +#include +#include +#include +#include +#include +#include + +#define MANA_NO_SIGNAL_WC (0xff) + +struct shadow_wqe_header { + /* ibv_wc_opcode */ + uint8_t opcode; + /* ibv_wc_flags or MANA_NO_SIGNAL_WC */ + uint8_t flags; + /* ibv_wc_status */ + uint8_t vendor_error_code; + uint8_t posted_wqe_size_in_bu; + uint32_t unmasked_queue_offset; + uint64_t wr_id; +}; + +struct rc_sq_shadow_wqe { + struct shadow_wqe_header header; + uint32_t end_psn; + uint32_t read_posted_wqe_size_in_bu; +}; + +struct rc_rq_shadow_wqe { + struct shadow_wqe_header header; + uint32_t byte_len; + uint32_t imm_or_rkey; +}; + +struct shadow_queue { + uint64_t prod_idx; + uint64_t cons_idx; + uint64_t next_to_complete_idx; + uint32_t length; + uint32_t stride; + void *buffer; +}; + +static inline void reset_shadow_queue(struct shadow_queue *queue) +{ + queue->prod_idx = 0; + queue->cons_idx = 0; + queue->next_to_complete_idx = 0; +} + +static inline int create_shadow_queue(struct shadow_queue *queue, uint32_t length, uint32_t stride) +{ + length = roundup_pow_of_two(length); + stride = align(stride, 8); + + void *buffer = mmap(NULL, stride * length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (buffer == MAP_FAILED) + return -1; + + queue->length = length; + queue->stride = stride; + reset_shadow_queue(queue); + queue->buffer = buffer; + return 0; +} + +static inline void destroy_shadow_queue(struct shadow_queue *queue) +{ + if (queue->buffer) { + munmap(queue->buffer, queue->stride * queue->length); + queue->buffer = NULL; + } +} + +static inline struct shadow_wqe_header * +shadow_queue_get_element(const struct shadow_queue *queue, uint64_t unmasked_index) +{ + uint32_t index = unmasked_index & (queue->length - 1); + + return (struct shadow_wqe_header *)((uint8_t *)queue->buffer + index * queue->stride); +} + +static inline bool shadow_queue_full(struct shadow_queue *queue) +{ + return (queue->prod_idx - queue->cons_idx) >= queue->length; +} + +static inline struct shadow_wqe_header * +shadow_queue_producer_entry(struct shadow_queue *queue) +{ + return shadow_queue_get_element(queue, queue->prod_idx); +} + +static inline void shadow_queue_advance_producer(struct shadow_queue *queue) +{ + queue->prod_idx++; +} + +static inline void shadow_queue_retreat_producer(struct shadow_queue *queue) +{ + queue->prod_idx--; +} + +static inline void shadow_queue_advance_consumer(struct shadow_queue *queue) +{ + queue->cons_idx++; +} + +static inline bool shadow_queue_empty(struct shadow_queue *queue) +{ + return queue->prod_idx == queue->cons_idx; +} + +static inline 
uint32_t shadow_queue_get_pending_wqe_count(struct shadow_queue *queue) +{ + return (uint32_t)(queue->prod_idx - queue->next_to_complete_idx); +} + +static inline struct shadow_wqe_header * +shadow_queue_get_next_to_consume(const struct shadow_queue *queue) +{ + if (queue->cons_idx == queue->next_to_complete_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->cons_idx); +} + +static inline struct shadow_wqe_header * +shadow_queue_get_next_to_complete(struct shadow_queue *queue) +{ + if (queue->next_to_complete_idx == queue->prod_idx) + return NULL; + + return shadow_queue_get_element(queue, queue->next_to_complete_idx); +} + +static inline void shadow_queue_advance_next_to_complete(struct shadow_queue *queue) +{ + queue->next_to_complete_idx++; +} + +#endif //_SHADOW_QUEUE_H_ diff --git a/providers/mana/wr.c b/providers/mana/wr.c new file mode 100644 index 000000000..59759710b --- /dev/null +++ b/providers/mana/wr.c @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, Microsoft Corporation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "mana.h" +#include "doorbells.h" +#include "rollback.h" +#include "gdma.h" + +static inline void zero_wqe_content(struct gdma_wqe *wqe) +{ + memset(wqe->gdma_oob, 0, sizeof(union gdma_oob) + wqe->client_oob_size); + memset(wqe->sgl1, 0, wqe->num_sge1 * sizeof(struct gdma_sge)); + if (wqe->sgl2) + memset(wqe->sgl2, 0, wqe->num_sge2 * sizeof(struct gdma_sge)); +} + +static inline void gdma_advance_producer(struct mana_gdma_queue *wq, uint32_t size_in_bu) +{ + wq->prod_idx = (wq->prod_idx + size_in_bu) & GDMA_QUEUE_OFFSET_MASK; +} + +static inline int +gdma_get_current_wqe(struct mana_gdma_queue *wq, uint32_t client_oob_size, + uint32_t wqe_size, struct gdma_wqe *wqe) +{ + uint32_t wq_size = wq->size; + uint32_t used_entries = (wq->prod_idx - wq->cons_idx) & GDMA_QUEUE_OFFSET_MASK; + uint32_t free_space = wq_size - (used_entries * GDMA_WQE_ALIGNMENT_UNIT_SIZE); + + if (wqe_size > free_space) + return ENOMEM; + + uint32_t aligned_sgl_size = wqe_size - sizeof(union gdma_oob) - client_oob_size; + uint32_t total_num_sges = aligned_sgl_size / sizeof(struct gdma_sge); + uint32_t offset = (wq->prod_idx * GDMA_WQE_ALIGNMENT_UNIT_SIZE) & (wq_size - 1); + + wqe->unmasked_wqe_index = wq->prod_idx; + wqe->size_in_bu = wqe_size / GDMA_WQE_ALIGNMENT_UNIT_SIZE; + wqe->gdma_oob = (union gdma_oob *)((uint8_t *)wq->buffer + offset); + wqe->client_oob = ((uint8_t *)wqe->gdma_oob) + sizeof(union gdma_oob); + wqe->client_oob_size = client_oob_size; + + if (likely(wq_size - offset >= wqe_size)) { + wqe->sgl1 = (struct gdma_sge *)((uint8_t *)wqe->client_oob + client_oob_size); + wqe->num_sge1 = total_num_sges; + wqe->sgl2 = NULL; + wqe->num_sge2 = 0; + } else { + if (offset + sizeof(union gdma_oob) + client_oob_size == wq_size) { + wqe->sgl1 = (struct gdma_sge *)wq->buffer; + wqe->num_sge1 = total_num_sges; + wqe->sgl2 = NULL; + wqe->num_sge2 = 0; + } else { + wqe->sgl1 = (struct gdma_sge *)((uint8_t *)wqe->client_oob + + client_oob_size); + wqe->num_sge1 = (wq_size - offset - sizeof(union gdma_oob) + - client_oob_size) / sizeof(struct gdma_sge); + wqe->sgl2 = (struct gdma_sge *)wq->buffer; + wqe->num_sge2 = total_num_sges - wqe->num_sge1; + } + } + + zero_wqe_content(wqe); + return 0; +} + +static inline void gdma_write_sge(struct gdma_wqe *wqe, void *oob_sge, + struct ibv_sge *sge, uint32_t num_sge) +{ 
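+ /*
+ * Copy the caller's SGEs into the WQE. When oob_sge is supplied (the
+ * extra-large OOB used by write-with-immediate) it takes the first
+ * slot of sgl1; if the SGL wraps past the end of the ring buffer,
+ * writing continues in sgl2 at the start of the queue.
+ */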
+ struct gdma_sge *gdma_sgl = wqe->sgl1; + uint32_t num_sge1 = wqe->num_sge1; + uint32_t i; + + if (oob_sge) { + memcpy(gdma_sgl, oob_sge, sizeof(*gdma_sgl)); + gdma_sgl++; + num_sge1--; + } + + for (i = 0; i < num_sge; ++i, ++gdma_sgl) { + if (i == num_sge1) + gdma_sgl = wqe->sgl2; + + gdma_sgl->address = sge->addr; + gdma_sgl->size = sge->length; + gdma_sgl->mem_key = sge->lkey; + } +} + +static inline int +gdma_post_rq_wqe(struct mana_gdma_queue *wq, struct ibv_sge *sgl, void *oob, + uint32_t num_sge, enum gdma_work_req_flags flags, struct gdma_wqe *wqe) +{ + uint32_t wqe_size = get_wqe_size(num_sge); + int ret; + + ret = gdma_get_current_wqe(wq, INLINE_OOB_SMALL_SIZE, wqe_size, wqe); + if (ret) + return ret; + + wqe->gdma_oob->rx.num_sgl_entries = num_sge; + wqe->gdma_oob->rx.inline_client_oob_size = INLINE_OOB_SMALL_SIZE / sizeof(uint32_t); + wqe->gdma_oob->rx.check_sn = (flags & GDMA_WORK_REQ_CHECK_SN) != 0; + if (oob) + memcpy(wqe->client_oob, oob, INLINE_OOB_SMALL_SIZE); + + gdma_write_sge(wqe, NULL, sgl, num_sge); + gdma_advance_producer(wq, wqe->size_in_bu); + return 0; +} + +static int mana_ib_rc_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mana_context *mc = container_of(verbs_get_ctx(ibqp->context), + struct mana_context, ibv_ctx); + struct mana_qp *qp = container_of(ibqp, struct mana_qp, ibqp.qp); + struct mana_gdma_queue *wq = &qp->rc_qp.queues[USER_RC_RECV_QUEUE_RESPONDER]; + struct shadow_wqe_header *shadow_wqe; + struct gdma_wqe wqe_info; + uint8_t wqe_cnt = 0; + int ret = 0; + + pthread_spin_lock(&qp->rq_lock); + for (; wr; wr = wr->next) { + if (shadow_queue_full(&qp->shadow_rq)) { + verbs_err(&mc->ibv_ctx, "recv shadow queue full\n"); + ret = ENOMEM; + goto cleanup; + } + + ret = gdma_post_rq_wqe(wq, wr->sg_list, NULL, wr->num_sge, + GDMA_WORK_REQ_NONE, &wqe_info); + if (ret) { + verbs_err(&mc->ibv_ctx, "Failed to post RQ wqe , ret %d\n", ret); + goto cleanup; + } + wqe_cnt++; + + shadow_wqe = shadow_queue_producer_entry(&qp->shadow_rq); + memset(shadow_wqe, 0, sizeof(*shadow_wqe)); + shadow_wqe->opcode = IBV_WC_RECV; + shadow_wqe->wr_id = wr->wr_id; + shadow_wqe->unmasked_queue_offset = wqe_info.unmasked_wqe_index; + shadow_wqe->posted_wqe_size_in_bu = wqe_info.size_in_bu; + shadow_queue_advance_producer(&qp->shadow_rq); + } + +cleanup: + if (wqe_cnt) + gdma_ring_recv_doorbell(wq, wqe_cnt); + pthread_spin_unlock(&qp->rq_lock); + if (bad_wr && ret) + *bad_wr = wr; + return ret; +} + +int mana_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad) +{ + switch (ibqp->qp_type) { + case IBV_QPT_RC: + return mana_ib_rc_post_recv(ibqp, wr, bad); + default: + verbs_err(verbs_get_ctx(ibqp->context), "QPT not supported %d\n", ibqp->qp_type); + return EOPNOTSUPP; + } +} + +static inline bool is_opcode_supported(enum ibv_wr_opcode opcode) +{ + switch (opcode) { + case IBV_WR_RDMA_READ: + case IBV_WR_RDMA_WRITE: + case IBV_WR_SEND: + case IBV_WR_SEND_WITH_IMM: + case IBV_WR_RDMA_WRITE_WITH_IMM: + return true; + default: + return false; + } +} + +static inline enum ibv_wc_opcode + convert_wr_to_wc(enum ibv_wr_opcode opcode) +{ + switch (opcode) { + case IBV_WR_SEND_WITH_IMM: + case IBV_WR_SEND: + return IBV_WC_SEND; + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_WRITE: + return IBV_WC_RDMA_WRITE; + case IBV_WR_RDMA_READ: + return IBV_WC_RDMA_READ; + case IBV_WR_ATOMIC_CMP_AND_SWP: + return IBV_WC_COMP_SWAP; + case IBV_WR_ATOMIC_FETCH_AND_ADD: + return IBV_WC_FETCH_ADD; + default: + return 0xFF; 
+ } +} + +static inline int +gdma_post_sq_wqe(struct mana_gdma_queue *wq, struct ibv_sge *sgl, struct rdma_send_oob *send_oob, + void *oob_sge, uint32_t num_sge, uint32_t mtu, + enum gdma_work_req_flags flags, struct gdma_wqe *wqe) +{ + struct ibv_sge dummy = {1, 0, 0}; + uint32_t total_sge, wqe_size; + int ret; + + if (num_sge == 0) { + num_sge = 1; + sgl = &dummy; + } + + total_sge = num_sge + (oob_sge ? 1 : 0); + wqe_size = get_large_wqe_size(total_sge); + + ret = gdma_get_current_wqe(wq, INLINE_OOB_LARGE_SIZE, wqe_size, wqe); + if (ret) + return ret; + + wqe->gdma_oob->tx.num_padding_sgls = wqe->num_sge1 + wqe->num_sge2 - total_sge; + wqe->gdma_oob->tx.num_sgl_entries = wqe->num_sge1 + wqe->num_sge2; + wqe->gdma_oob->tx.inline_client_oob_size = INLINE_OOB_LARGE_SIZE / sizeof(uint32_t); + if (flags & GDMA_WORK_REQ_EXTRA_LARGE_OOB) { + /* the first SGE was a part of the extra large OOB */ + wqe->gdma_oob->tx.num_sgl_entries -= 1; + wqe->gdma_oob->tx.inline_client_oob_size += 1; + } + wqe->gdma_oob->tx.client_oob_in_sgl = (flags & GDMA_WORK_REQ_OOB_IN_SGL) != 0; + wqe->gdma_oob->tx.consume_credit = (flags & GDMA_WORK_REQ_CONSUME_CREDIT) != 0; + wqe->gdma_oob->tx.fence = (flags & GDMA_WORK_REQ_FENCE) != 0; + wqe->gdma_oob->tx.client_data_unit = mtu; + wqe->gdma_oob->tx.check_sn = (flags & GDMA_WORK_REQ_CHECK_SN) != 0; + wqe->gdma_oob->tx.sgl_direct = (flags & GDMA_WORK_REQ_SGL_DIRECT) != 0; + + memcpy(wqe->client_oob, send_oob, INLINE_OOB_LARGE_SIZE); + + gdma_write_sge(wqe, oob_sge, sgl, num_sge); + gdma_advance_producer(wq, wqe->size_in_bu); + return 0; +} + +static inline int +mana_ib_rc_post_send_request(struct mana_qp *qp, struct ibv_send_wr *wr, + struct rc_sq_shadow_wqe *shadow_wqe) +{ + enum gdma_work_req_flags flags = GDMA_WORK_REQ_NONE; + struct extra_large_wqe extra_wqe = {0}; + struct rdma_send_oob send_oob = {0}; + struct gdma_wqe gdma_wqe = {0}; + uint32_t num_sge = wr->num_sge; + void *oob_sge = NULL; + uint32_t msg_sz = 0; + int i, ret; + + for (i = 0; i < num_sge; i++) + msg_sz += wr->sg_list[i].length; + + if (wr->opcode == IBV_WR_RDMA_READ) { + struct rdma_recv_oob recv_oob = {0}; + + recv_oob.psn_start = qp->rc_qp.sq_psn; + ret = gdma_post_rq_wqe(&qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER], wr->sg_list, + &recv_oob, num_sge, GDMA_WORK_REQ_CHECK_SN, &gdma_wqe); + if (ret) { + verbs_err(verbs_get_ctx(qp->ibqp.qp.context), + "rc post Read data WQE error, ret %d\n", ret); + goto cleanup; + } + shadow_wqe->read_posted_wqe_size_in_bu = gdma_wqe.size_in_bu; + gdma_ring_recv_doorbell(&qp->rc_qp.queues[USER_RC_RECV_QUEUE_REQUESTER], 1); + // for reads no sge to use dummy sgl + num_sge = 0; + } + + send_oob.wqe_type = convert_wr_to_hw_opcode(wr->opcode); + send_oob.fence = (wr->send_flags & IBV_SEND_FENCE) != 0; + send_oob.signaled = (wr->send_flags & IBV_SEND_SIGNALED) != 0; + send_oob.solicited = (wr->send_flags & IBV_SEND_SOLICITED) != 0; + send_oob.psn = qp->rc_qp.sq_psn; + send_oob.ssn = qp->rc_qp.sq_ssn; + + switch (wr->opcode) { + case IBV_WR_SEND_WITH_INV: + flags |= GDMA_WORK_REQ_CHECK_SN; + send_oob.send.invalidate_key = wr->invalidate_rkey; + break; + case IBV_WR_SEND_WITH_IMM: + send_oob.send.immediate = htole32(be32toh(wr->imm_data)); + SWITCH_FALLTHROUGH; + case IBV_WR_SEND: + flags |= GDMA_WORK_REQ_CHECK_SN; + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + flags |= GDMA_WORK_REQ_CHECK_SN; + flags |= GDMA_WORK_REQ_EXTRA_LARGE_OOB; + extra_wqe.immediate = htole32(be32toh(wr->imm_data)); + oob_sge = &extra_wqe; + SWITCH_FALLTHROUGH; + case IBV_WR_RDMA_WRITE: + case 
IBV_WR_RDMA_READ: + send_oob.rdma.address_hi = (uint32_t)(wr->wr.rdma.remote_addr >> 32); + send_oob.rdma.address_low = (uint32_t)(wr->wr.rdma.remote_addr & 0xFFFFFFFF); + send_oob.rdma.rkey = wr->wr.rdma.rkey; + send_oob.rdma.dma_len = msg_sz; + break; + default: + goto cleanup; + } + + ret = gdma_post_sq_wqe(&qp->rc_qp.queues[USER_RC_SEND_QUEUE_REQUESTER], wr->sg_list, + &send_oob, oob_sge, num_sge, MTU_SIZE(qp->mtu), flags, &gdma_wqe); + if (ret) { + verbs_err(verbs_get_ctx(qp->ibqp.qp.context), + "rc post send error, ret %d\n", ret); + goto cleanup; + } + + qp->rc_qp.sq_psn = PSN_ADD(qp->rc_qp.sq_psn, PSN_DELTA(msg_sz, qp->mtu)); + qp->rc_qp.sq_ssn = PSN_INC(qp->rc_qp.sq_ssn); + + shadow_wqe->header.wr_id = wr->wr_id; + shadow_wqe->header.opcode = convert_wr_to_wc(wr->opcode); + shadow_wqe->header.flags = (wr->send_flags & IBV_SEND_SIGNALED) ? 0 : MANA_NO_SIGNAL_WC; + shadow_wqe->header.posted_wqe_size_in_bu = gdma_wqe.size_in_bu; + shadow_wqe->header.unmasked_queue_offset = gdma_wqe.unmasked_wqe_index; + shadow_wqe->end_psn = PSN_DEC(qp->rc_qp.sq_psn); + + return 0; + +cleanup: + return EINVAL; +} + +static int mana_ib_rc_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct mana_qp *qp = container_of(ibqp, struct mana_qp, ibqp.qp); + int ret = 0; + bool ring = false; + + pthread_spin_lock(&qp->sq_lock); + + for (; wr; wr = wr->next) { + if ((wr->send_flags & IBV_SEND_SIGNALED) && shadow_queue_full(&qp->shadow_sq)) { + verbs_err(verbs_get_ctx(ibqp->context), "shadow queue full\n"); + ret = ENOMEM; + goto cleanup; + } + + if (!is_opcode_supported(wr->opcode)) { + ret = EINVAL; + goto cleanup; + } + + /* Fill shadow queue data */ + struct rc_sq_shadow_wqe *shadow_wqe = (struct rc_sq_shadow_wqe *) + shadow_queue_producer_entry(&qp->shadow_sq); + memset(shadow_wqe, 0, sizeof(struct rc_sq_shadow_wqe)); + + ret = mana_ib_rc_post_send_request(qp, wr, shadow_wqe); + if (ret) { + verbs_err(verbs_get_ctx(qp->ibqp.qp.context), + "Failed to post send request ret %d\n", ret); + goto cleanup; + } + ring = true; + + shadow_queue_advance_producer(&qp->shadow_sq); + mana_ib_update_shared_mem_right_offset(qp, shadow_wqe->header.unmasked_queue_offset); + } + +cleanup: + if (ring) + gdma_ring_send_doorbell(&qp->rc_qp.queues[USER_RC_SEND_QUEUE_REQUESTER]); + pthread_spin_unlock(&qp->sq_lock); + if (bad_wr && ret) + *bad_wr = wr; + + return ret; +} + +int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad) +{ + switch (ibqp->qp_type) { + case IBV_QPT_RC: + return mana_ib_rc_post_send(ibqp, wr, bad); + default: + verbs_err(verbs_get_ctx(ibqp->context), "QPT not supported %d\n", ibqp->qp_type); + return EOPNOTSUPP; + } +}
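For reference, a minimal sketch of how an application reaches the new RC path through the verbs entry points this patch implements (mana_create_cq, mana_create_qp, mana_post_send, mana_poll_cq). The context, PD, MR, buffer and queue depths are assumptions, and the RESET->INIT->RTR->RTS transitions and address exchange via ibv_modify_qp() are elided:

/* ctx, pd, mr, buf and len are assumed to exist already */
struct ibv_cq *cq = ibv_create_cq(ctx, 256, NULL, NULL, 0);
struct ibv_qp_init_attr init = {
	.send_cq = cq,
	.recv_cq = cq,
	.cap = { .max_send_wr = 64, .max_recv_wr = 64,
		 .max_send_sge = 1, .max_recv_sge = 1 },
	.qp_type = IBV_QPT_RC,
};
struct ibv_qp *qp = ibv_create_qp(pd, &init);	/* mana_create_qp() -> mana_create_qp_rc() */

/* ... ibv_modify_qp() through INIT/RTR/RTS, exchanging QPN and PSNs with the peer ... */

struct ibv_sge sge = { .addr = (uintptr_t)buf, .length = len, .lkey = mr->lkey };
struct ibv_send_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1,
			  .opcode = IBV_WR_SEND, .send_flags = IBV_SEND_SIGNALED };
struct ibv_send_wr *bad;
int err = ibv_post_send(qp, &wr, &bad);		/* mana_post_send() -> mana_ib_rc_post_send() */

struct ibv_wc wc;
while (!err && ibv_poll_cq(cq, 1, &wc) == 0)	/* mana_poll_cq() drains the hardware CQ */
	;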