Skip to content

Commit

Permalink
TL/MLX5: rcache (#753)
Browse files Browse the repository at this point in the history
* TL/MLX5: rcache

* CODESTYLE: clang-tidy

* TL/MLX5: fixes and Linter errors

* TL/MLX5: minor reviews

* TL/MLX5: minor reviews

* TL/MLX5: minor reviews

* TL/MLX5: revisions

* TL/MLX5: fix Linter error

---------

Co-authored-by: Valentin Petrov <valentinp@nvidia.com>
  • Loading branch information
samnordmann and Valentin Petrov authored May 23, 2023
1 parent c2a5062 commit 17a922c
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 3 deletions.
3 changes: 2 additions & 1 deletion src/components/tl/mlx5/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ sources = \
tl_mlx5_wqe.h \
tl_mlx5_wqe.c \
tl_mlx5_pd.h \
tl_mlx5_pd.c
tl_mlx5_pd.c \
tl_mlx5_rcache.c

module_LTLIBRARIES = libucc_tl_mlx5.la
libucc_tl_mlx5_la_SOURCES = $(sources)
Expand Down
11 changes: 11 additions & 0 deletions src/components/tl/mlx5/tl_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,17 @@ typedef struct ucc_tl_mlx5_team {
UCC_CLASS_DECLARE(ucc_tl_mlx5_team_t, ucc_base_context_t *,
const ucc_base_team_params_t *);

/**
 * Create the memory registration cache for a TL/MLX5 context.
 *
 * @param ctx TL/MLX5 context; on success the cache handle is stored in
 *            ctx->rcache.
 * @return status returned by the underlying rcache creation call.
 */
ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx);

/* Registration data kept for one cached memory region. */
typedef struct ucc_tl_mlx5_reg {
/* verbs memory region handle produced by ibv_reg_mr() */
struct ibv_mr *mr;
} ucc_tl_mlx5_reg_t;

/*
 * TL/MLX5 view of a registration cache region: the generic region followed by
 * the TL-specific registration data. The rcache callbacks obtain this type
 * from a generic region pointer with ucc_derived_of().
 */
typedef struct ucc_tl_mlx5_rcache_region {
/* generic region; NOTE(review): presumably has to stay the first member for
 * ucc_derived_of() to work - confirm */
ucc_rcache_region_t super;
/* memory registration attached to this region */
ucc_tl_mlx5_reg_t reg;
} ucc_tl_mlx5_rcache_region_t;

#define UCC_TL_MLX5_SUPPORTED_COLLS (UCC_COLL_TYPE_ALLTOALL)

#define UCC_TL_MLX5_TEAM_LIB(_team) \
Expand Down
14 changes: 12 additions & 2 deletions src/components/tl/mlx5/tl_mlx5_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
status = ucc_mpool_init(
&self->req_mp, 0,
ucc_max(sizeof(ucc_tl_mlx5_task_t), sizeof(ucc_tl_mlx5_schedule_t)), 0,
UCC_CACHE_LINE_SIZE, 8, UINT_MAX, NULL, params->thread_mode,
"tl_mlx5_req_mp");
UCC_CACHE_LINE_SIZE, 8, UINT_MAX, &ucc_coll_task_mpool_ops,
params->thread_mode, "tl_mlx5_req_mp");
if (UCC_OK != status) {
tl_error(self->super.super.lib,
"failed to initialize tl_mlx5_req mpool");
Expand All @@ -48,6 +48,9 @@ UCC_CLASS_INIT_FUNC(ucc_tl_mlx5_context_t,
UCC_CLASS_CLEANUP_FUNC(ucc_tl_mlx5_context_t)
{
tl_debug(self->super.super.lib, "finalizing tl context: %p", self);
if (self->rcache) {
ucc_rcache_destroy(self->rcache);
}

if (ucc_tl_mlx5_remove_shared_ctx_pd(self) != UCC_OK) {
tl_error(self->super.super.lib, "failed to free ib ctx and pd");
Expand Down Expand Up @@ -245,8 +248,15 @@ ucc_status_t ucc_tl_mlx5_context_create_epilog(ucc_base_context_t *context)
goto err;
}

status = tl_mlx5_rcache_create(ctx);
if (UCC_OK != status) {
tl_error(context->lib, "failed to create rcache");
goto err;
}

ucc_free(sbcast_data);
ucc_topo_cleanup(topo);

return UCC_OK;

err:
Expand Down
75 changes: 75 additions & 0 deletions src/components/tl/mlx5/tl_mlx5_rcache.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include "tl_mlx5.h"

/**
 * rcache "mem_reg" callback: register the address range of a cache region
 * with the shared protection domain of the TL/MLX5 context.
 *
 * @param context TL/MLX5 context (ucc_tl_mlx5_context_t *) given to the rcache
 *                at creation time.
 * @param rcache  unused.
 * @param arg     pointer to an int that is set to 1 whenever this callback
 *                runs. NOTE(review): presumably lets the caller detect that a
 *                new registration was performed - confirm against the caller.
 * @param rregion region to register; covers [super.start, super.end).
 * @param flags   unused.
 * @return UCS_OK on success, UCS_ERR_NO_MESSAGE if ibv_reg_mr() fails.
 */
static ucs_status_t
rcache_reg_mr(void *context, ucc_rcache_t *rcache, //NOLINT: rcache is unused
              void *arg, ucc_rcache_region_t *rregion,
              uint16_t flags) //NOLINT: flags is unused
{
    ucc_tl_mlx5_rcache_region_t *region =
        ucc_derived_of(rregion, ucc_tl_mlx5_rcache_region_t);
    ucc_tl_mlx5_context_t *mlx5_ctx = (ucc_tl_mlx5_context_t *)context;
    int                   *updated  = (int *)arg;
    void                  *start    = (void *)rregion->super.start;
    size_t                 len      =
        (size_t)(rregion->super.end - rregion->super.start);
    struct ibv_mr         *mr;

    /* The flag is raised before the registration attempt, so it is set even
     * when the registration below fails. */
    *updated = 1;

    mr = ibv_reg_mr(mlx5_ctx->shared_pd, start, len,
                    IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    region->reg.mr = mr;
    if (mr != NULL) {
        return UCS_OK;
    }

    tl_error(mlx5_ctx->super.super.lib, "failed to register memory");
    return UCS_ERR_NO_MESSAGE;
}

/**
 * rcache "mem_dereg" callback: release the verbs memory region attached to a
 * cache region.
 *
 * The callback has no way to return a status, so a failing ibv_dereg_mr() is
 * reported through the TL error log instead of being silently dropped.
 *
 * @param context TL/MLX5 context (ucc_tl_mlx5_context_t *) given to the rcache
 *                at creation time; used for error logging only.
 * @param rcache  unused.
 * @param rregion region whose memory registration is released.
 */
static void rcache_dereg_mr(void *context,
                            ucc_rcache_t *rcache, //NOLINT: rcache is unused
                            ucc_rcache_region_t *rregion)
{
    ucc_tl_mlx5_context_t       *ctx = (ucc_tl_mlx5_context_t *)context;
    ucc_tl_mlx5_rcache_region_t *mlx5_rregion =
        ucc_derived_of(rregion, ucc_tl_mlx5_rcache_region_t);

    /* ibv_dereg_mr() returns 0 on success and a non-zero value on failure */
    if (ibv_dereg_mr(mlx5_rregion->reg.mr) != 0) {
        tl_error(ctx->super.super.lib, "failed to deregister memory");
    }
}

/**
 * rcache "dump_region" callback: write a short description of a region (the
 * address of its memory region handle) into a caller-provided buffer.
 *
 * @param context unused.
 * @param rcache  unused.
 * @param rregion region to describe.
 * @param buf     destination buffer.
 * @param max     size of buf in bytes; snprintf() truncates to this size.
 */
static void ucc_tl_mlx5_rcache_dump_region_cb(void *context, //NOLINT
                                              ucc_rcache_t *rcache, //NOLINT
                                              ucs_rcache_region_t *rregion,
                                              char *buf, size_t max)
{
    ucc_tl_mlx5_rcache_region_t *mlx5_rregion =
        ucc_derived_of(rregion, ucc_tl_mlx5_rcache_region_t);

    /* %p requires a void * argument, hence the explicit cast.
     * NOTE(review): the "bar ptr" label looks like a leftover from another
     * component - confirm whether it should mention the mr instead. */
    snprintf(buf, max, "bar ptr:%p", (void *)mlx5_rregion->reg.mr);
}

/* Callback table handed to the registration cache: how to register a region,
 * how to release it, and how to describe it in a dump. */
static ucc_rcache_ops_t ucc_rcache_ops = {
    .dump_region = ucc_tl_mlx5_rcache_dump_region_cb,
    .mem_dereg   = rcache_dereg_mr,
    .mem_reg     = rcache_reg_mr
};

/**
 * Create the memory registration cache of a TL/MLX5 context.
 *
 * The cache is named "MLX5", uses the callbacks in ucc_rcache_ops, receives
 * @a ctx as the context argument of those callbacks, and subscribes to the
 * VM-unmapped and memory-type-free events.
 *
 * @param ctx TL/MLX5 context; on success the cache handle is stored in
 *            ctx->rcache.
 * @return status returned by ucc_rcache_create().
 */
ucc_status_t tl_mlx5_rcache_create(ucc_tl_mlx5_context_t *ctx)
{
    /* A designated initializer zeroes every member that is not listed, so no
     * field of the parameter structure reaches ucc_rcache_create() with an
     * indeterminate value (the structure was previously left uninitialized
     * apart from the members set here).
     * NOTE(review): if the rcache parameter structure in use has limit
     * members (e.g. a maximum number of regions or maximum total size), zero
     * may be a restrictive value - confirm and set explicit defaults. */
    ucc_rcache_params_t rcache_params = {
        .region_struct_size = sizeof(ucc_tl_mlx5_rcache_region_t),
        .alignment          = UCS_PGT_ADDR_ALIGN,
        .max_alignment      = ucc_get_page_size(),
        .ucm_events         = UCM_EVENT_VM_UNMAPPED | UCM_EVENT_MEM_TYPE_FREE,
        .ucm_event_priority = 1000,
        .ops                = &ucc_rcache_ops,
        .context            = (void *)ctx
    };

    return ucc_rcache_create(&rcache_params, "MLX5", &ctx->rcache);
}

0 comments on commit 17a922c

Please sign in to comment.