Skip to content

Commit 11f552e

Browse files
mikijoy authored and jgunthorpe committed
IB/mlx5: Test write combining support
Linux can run in all sorts of physical machines and VMs where write combining may or may not be supported. Currently there is no way to reliably tell whether the system supports WC. The driver uses WC to optimize posting work to the HCA, and getting this wrong in either direction can cause a significant performance loss.

Add a test to the mlx5_ib initialization process that checks whether write combining is supported on the machine. The test runs as part of the enable_driver callback, which ensures that it runs after the device is set up and can create and modify the QP it needs, but before the device is exposed to users.

The test opens a UD QP and posts NOP WQEs. The WQE written to the BlueFlame register differs from the WQE in memory: a CQE is requested only on the BlueFlame WQE. By checking whether we received a completion on one of these WQEs we can tell whether BlueFlame succeeded, and thus whether write combining is supported.

Change reporting of BlueFlame support to depend on write-combining support instead of the FW's guess as to what the machine can do.

Link: https://lore.kernel.org/r/20191027062234.10993-1-leon@kernel.org
Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
1 parent 546d300 commit 11f552e

File tree

4 files changed

+223
-3
lines changed

4 files changed

+223
-3
lines changed

drivers/infiniband/hw/mlx5/main.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1796,7 +1796,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
17961796
return -EINVAL;
17971797

17981798
resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1799-
if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1799+
if (dev->wc_support)
18001800
resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
18011801
resp.cache_line_size = cache_line_size();
18021802
resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
@@ -6256,6 +6256,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
62566256
.disassociate_ucontext = mlx5_ib_disassociate_ucontext,
62576257
.drain_rq = mlx5_ib_drain_rq,
62586258
.drain_sq = mlx5_ib_drain_sq,
6259+
.enable_driver = mlx5_ib_enable_driver,
62596260
.fill_res_entry = mlx5_ib_fill_res_entry,
62606261
.fill_stat_entry = mlx5_ib_fill_stat_entry,
62616262
.get_dev_fw_str = get_dev_fw_str,
@@ -6699,6 +6700,18 @@ static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
66996700
}
67006701
}
67016702

6703+
int mlx5_ib_enable_driver(struct ib_device *dev)
6704+
{
6705+
struct mlx5_ib_dev *mdev = to_mdev(dev);
6706+
int ret;
6707+
6708+
ret = mlx5_ib_test_wc(mdev);
6709+
mlx5_ib_dbg(mdev, "Write-Combining %s",
6710+
mdev->wc_support ? "supported" : "not supported");
6711+
6712+
return ret;
6713+
}
6714+
67026715
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
67036716
const struct mlx5_ib_profile *profile,
67046717
int stage)

drivers/infiniband/hw/mlx5/mem.c

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <rdma/ib_umem.h>
3535
#include <rdma/ib_umem_odp.h>
3636
#include "mlx5_ib.h"
37+
#include <linux/jiffies.h>
3738

3839
/* @umem: umem object to scan
3940
* @addr: ib virtual address requested by the user
@@ -216,3 +217,201 @@ int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
216217
*offset = buf_off >> ilog2(off_size);
217218
return 0;
218219
}
220+
221+
/* wr_id given to the unsignaled NOP WQEs posted by test_wc_do_send(). */
#define WR_ID_BF 0xBF
222+
/* wr_id given to the final, signaled NOP WQE that closes the test batch. */
#define WR_ID_END 0xBAD
223+
/* Number of WR_ID_BF WQEs posted before the WR_ID_END WQE. */
#define TEST_WC_NUM_WQES 255
224+
/* Upper bound on how long test_wc_poll_cq_result() polls the CQ (100 ms). */
#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
225+
static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id,
226+
bool signaled)
227+
{
228+
struct mlx5_ib_qp *qp = to_mqp(ibqp);
229+
struct mlx5_wqe_ctrl_seg *ctrl;
230+
struct mlx5_bf *bf = &qp->bf;
231+
__be32 mmio_wqe[16] = {};
232+
unsigned long flags;
233+
unsigned int idx;
234+
int i;
235+
236+
if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
237+
return -EIO;
238+
239+
spin_lock_irqsave(&qp->sq.lock, flags);
240+
241+
idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
242+
ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
243+
244+
memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg));
245+
ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0;
246+
ctrl->opmod_idx_opcode =
247+
cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP);
248+
ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) |
249+
(qp->trans_qp.base.mqp.qpn << 8));
250+
251+
qp->sq.wrid[idx] = wr_id;
252+
qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP;
253+
qp->sq.wqe_head[idx] = qp->sq.head + 1;
254+
qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg),
255+
MLX5_SEND_WQE_BB);
256+
qp->sq.w_list[idx].next = qp->sq.cur_post;
257+
qp->sq.head++;
258+
259+
memcpy(mmio_wqe, ctrl, sizeof(*ctrl));
260+
((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |=
261+
MLX5_WQE_CTRL_CQ_UPDATE;
262+
263+
/* Make sure that descriptors are written before
264+
* updating doorbell record and ringing the doorbell
265+
*/
266+
wmb();
267+
268+
qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
269+
270+
/* Make sure doorbell record is visible to the HCA before
271+
* we hit doorbell
272+
*/
273+
wmb();
274+
for (i = 0; i < 8; i++)
275+
mlx5_write64(&mmio_wqe[i * 2],
276+
bf->bfreg->map + bf->offset + i * 8);
277+
278+
bf->offset ^= bf->buf_size;
279+
280+
spin_unlock_irqrestore(&qp->sq.lock, flags);
281+
282+
return 0;
283+
}
284+
285+
static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq)
286+
{
287+
int ret;
288+
struct ib_wc wc = {};
289+
unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
290+
291+
do {
292+
ret = ib_poll_cq(cq, 1, &wc);
293+
if (ret < 0 || wc.status)
294+
return ret < 0 ? ret : -EINVAL;
295+
if (ret)
296+
break;
297+
} while (!time_after(jiffies, end));
298+
299+
if (!ret)
300+
return -ETIMEDOUT;
301+
302+
if (wc.wr_id != WR_ID_BF)
303+
ret = 0;
304+
305+
return ret;
306+
}
307+
308+
static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp)
309+
{
310+
int err, i;
311+
312+
for (i = 0; i < TEST_WC_NUM_WQES; i++) {
313+
err = post_send_nop(dev, qp, WR_ID_BF, false);
314+
if (err)
315+
return err;
316+
}
317+
318+
return post_send_nop(dev, qp, WR_ID_END, true);
319+
}
320+
321+
int mlx5_ib_test_wc(struct mlx5_ib_dev *dev)
322+
{
323+
struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 };
324+
int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
325+
struct ib_qp_init_attr qp_init_attr = {
326+
.cap = { .max_send_wr = TEST_WC_NUM_WQES },
327+
.qp_type = IB_QPT_UD,
328+
.sq_sig_type = IB_SIGNAL_REQ_WR,
329+
.create_flags = MLX5_IB_QP_CREATE_WC_TEST,
330+
};
331+
struct ib_qp_attr qp_attr = { .port_num = 1 };
332+
struct ib_device *ibdev = &dev->ib_dev;
333+
struct ib_qp *qp;
334+
struct ib_cq *cq;
335+
struct ib_pd *pd;
336+
int ret;
337+
338+
if (!MLX5_CAP_GEN(dev->mdev, bf))
339+
return 0;
340+
341+
if (!dev->mdev->roce.roce_en &&
342+
port_type_cap == MLX5_CAP_PORT_TYPE_ETH) {
343+
if (mlx5_core_is_pf(dev->mdev))
344+
dev->wc_support = true;
345+
return 0;
346+
}
347+
348+
ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false);
349+
if (ret)
350+
goto print_err;
351+
352+
if (!dev->wc_bfreg.wc)
353+
goto out1;
354+
355+
pd = ib_alloc_pd(ibdev, 0);
356+
if (IS_ERR(pd)) {
357+
ret = PTR_ERR(pd);
358+
goto out1;
359+
}
360+
361+
cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
362+
if (IS_ERR(cq)) {
363+
ret = PTR_ERR(cq);
364+
goto out2;
365+
}
366+
367+
qp_init_attr.recv_cq = cq;
368+
qp_init_attr.send_cq = cq;
369+
qp = ib_create_qp(pd, &qp_init_attr);
370+
if (IS_ERR(qp)) {
371+
ret = PTR_ERR(qp);
372+
goto out3;
373+
}
374+
375+
qp_attr.qp_state = IB_QPS_INIT;
376+
ret = ib_modify_qp(qp, &qp_attr,
377+
IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX |
378+
IB_QP_QKEY);
379+
if (ret)
380+
goto out4;
381+
382+
qp_attr.qp_state = IB_QPS_RTR;
383+
ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
384+
if (ret)
385+
goto out4;
386+
387+
qp_attr.qp_state = IB_QPS_RTS;
388+
ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
389+
if (ret)
390+
goto out4;
391+
392+
ret = test_wc_do_send(dev, qp);
393+
if (ret < 0)
394+
goto out4;
395+
396+
ret = test_wc_poll_cq_result(dev, cq);
397+
if (ret > 0) {
398+
dev->wc_support = true;
399+
ret = 0;
400+
}
401+
402+
out4:
403+
ib_destroy_qp(qp);
404+
out3:
405+
ib_destroy_cq(cq);
406+
out2:
407+
ib_dealloc_pd(pd);
408+
out1:
409+
mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg);
410+
print_err:
411+
if (ret)
412+
mlx5_ib_err(
413+
dev,
414+
"Error %d while trying to test write-combining support\n",
415+
ret);
416+
return ret;
417+
}

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ struct mlx5_ib_flow_db {
248248
* rely on the range reserved for that use in the ib_qp_create_flags enum.
249249
*/
250250
#define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START
251+
#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1)
251252

252253
struct wr_list {
253254
u16 opcode;
@@ -966,6 +967,7 @@ struct mlx5_ib_dev {
966967
u8 fill_delay:1;
967968
u8 is_rep:1;
968969
u8 lag_active:1;
970+
u8 wc_support:1;
969971
struct umr_common umrc;
970972
/* sync used page count stats
971973
*/
@@ -993,6 +995,7 @@ struct mlx5_ib_dev {
993995
/* Array with num_ports elements */
994996
struct mlx5_ib_port *port;
995997
struct mlx5_sq_bfreg bfreg;
998+
struct mlx5_sq_bfreg wc_bfreg;
996999
struct mlx5_sq_bfreg fp_bfreg;
9971000
struct mlx5_ib_delay_drop delay_drop;
9981001
const struct mlx5_ib_profile *profile;
@@ -1506,4 +1509,7 @@ static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev,
15061509

15071510
return true;
15081511
}
1512+
1513+
int mlx5_ib_enable_driver(struct ib_device *dev);
1514+
int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
15091515
#endif /* MLX5_IB_H */

drivers/infiniband/hw/mlx5/qp.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,11 +1041,14 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
10411041
IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
10421042
IB_QP_CREATE_IPOIB_UD_LSO |
10431043
IB_QP_CREATE_NETIF_QP |
1044-
MLX5_IB_QP_CREATE_SQPN_QP1))
1044+
MLX5_IB_QP_CREATE_SQPN_QP1 |
1045+
MLX5_IB_QP_CREATE_WC_TEST))
10451046
return -EINVAL;
10461047

10471048
if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
10481049
qp->bf.bfreg = &dev->fp_bfreg;
1050+
else if (init_attr->create_flags & MLX5_IB_QP_CREATE_WC_TEST)
1051+
qp->bf.bfreg = &dev->wc_bfreg;
10491052
else
10501053
qp->bf.bfreg = &dev->bfreg;
10511054

@@ -5328,7 +5331,6 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
53285331
* we hit doorbell */
53295332
wmb();
53305333

5331-
/* currently we support only regular doorbells */
53325334
mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
53335335
/* Make sure doorbells don't leak out of SQ spinlock
53345336
* and reach the HCA out of order.

0 commit comments

Comments
 (0)