diff --git a/ompi/mca/mtl/ofi/README b/ompi/mca/mtl/ofi/README new file mode 100644 index 00000000000..753635c077b --- /dev/null +++ b/ompi/mca/mtl/ofi/README @@ -0,0 +1,69 @@ +OFI MTL + +The OFI MTL supports Libfabric (a.k.a. Open Fabrics Interfaces, OFI, +https://ofiwg.github.io/libfabric/) tagged APIs (fi_tagged(3)). At +initialization time, the MTL queries libfabric for providers supporting tag matching +(fi_getinfo(3)). Libfabric will return a list of providers that satisfy the requested +capabilities, having the most performant one at the top of the list. +The user may modify the OFI provider selection with mca parameters +mtl_ofi_provider_include or mtl_ofi_provider_exclude. + +PROGRESS: +The MTL registers a progress function to opal_progress. There is currently +no support for asynchronous progress. The progress function reads multiple events +from the OFI provider Completion Queue (CQ) per iteration (defaults to 100, can be +modified with the mca parameter mtl_ofi_progress_event_cnt) and iterates until the +completion queue is drained. + +COMPLETIONS: +Each operation uses a request type ompi_mtl_ofi_request_t which includes a reference +to an operation specific completion callback, an MPI request, and a context. The +context (fi_context) is used to map completion events to MPI requests when reading the +CQ. + +OFI TAG: +MPI needs to send 96 bits of information per message (32 bits communicator id, +32 bits source rank, 32 bits MPI tag) but OFI only offers 64-bit tags. In +addition, the OFI MTL uses 4 bits of the OFI tag for the synchronous send protocol. +Therefore, there are only 60 bits available in the OFI tag for message usage. The +OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this: + +"auto" (Default): +After the OFI provider is selected, a runtime check is performed to assess +FI_REMOTE_CQ_DATA and FI_DIRECTED_RECV support (see fi_tagged(3), fi_msg(3) +and fi_getinfo(3)). If supported, "ofi_tag_full" is used. 
If not supported, +fall back to "ofi_tag_1". + +"ofi_tag_1": +For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will +trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 60 +bits available in the OFI tag. There are two options available ("ofi_tag_1" and "ofi_tag_2") with different +number of bits for the Communicator ID and MPI tag fields. This tag distribution +offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to +provider reserved bits (see mem_tag_format below), 16 bits for Source Rank (max +Source Rank 65,535), 32 bits for MPI tag (max MPI tag is INT_MAX). + +"ofi_tag_2": +Same as "ofi_tag_1" but offering a different OFI tag distribution for +applications that may require a greater number of supported Communicators at the +expense of fewer MPI tag bits. This tag distribution offers: 24 bits for +Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 16 +bits for Source Rank (max Source Rank 65,535), 20 bits for MPI tag (max MPI tag +524,287). + +"ofi_tag_full": +For executions that cannot accept trimming source rank or MPI tag, this mode sends +source rank for each message in the CQ DATA. The Source Rank is made available at +the remote process CQ (FI_CQ_FORMAT_TAGGED is used, see fi_cq(3)) at the completion +of the matching receive operation. Since the minimum size for FI_REMOTE_CQ_DATA +is 32 bits, the Source Rank fits with no limitations. The OFI tag is used for the +Communicator ID (28 bits, max Communicator ID 268,435,455. See mem_tag_format below), +and the MPI tag (max MPI tag is INT_MAX). If this mode is selected by the user +and FI_REMOTE_CQ_DATA or FI_DIRECTED_RECV are not supported, the execution will abort. + +mem_tag_format (fi_endpoint(3)) +Some providers can reserve the higher order bits from the OFI tag for internal purposes. +This is signaled in mem_tag_format (see fi_endpoint(3)) by setting higher order bits +to zero. 
In such cases, the OFI MTL will reduce the number of communicator ids supported +by reducing the bits available for the communicator ID field in the OFI tag. + diff --git a/ompi/mca/mtl/ofi/mtl_ofi.c b/ompi/mca/mtl/ofi/mtl_ofi.c index ed6aae6bc44..7e19f170e64 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.c +++ b/ompi/mca/mtl/ofi/mtl_ofi.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved * * $COPYRIGHT$ * @@ -14,8 +14,8 @@ OMPI_DECLSPEC extern mca_mtl_ofi_component_t mca_mtl_ofi_component; mca_mtl_ofi_module_t ompi_mtl_ofi = { { - 8191, /* max cid - 2^13 - 1 */ - (1UL << 30), /* max tag value - must allow negatives */ + (int)((1ULL << MTL_OFI_CID_BIT_COUNT_1) - 1), /* max cid */ + (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1) ,/* max tag value */ 0, /* request reserve space */ 0, /* flags */ diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index f4c5f7c3f9a..b69f584b16c 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved * Copyright (c) 2017 Los Alamos National Security, LLC. All rights * reserved. 
* @@ -244,6 +244,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, ompi_proc_t *ompi_proc = NULL; mca_mtl_ofi_endpoint_t *endpoint = NULL; ompi_mtl_ofi_request_t *ack_req = NULL; /* For synchronous send */ + fi_addr_t src_addr = 0; ompi_proc = ompi_comm_peer_lookup(comm, dest); endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); @@ -255,6 +256,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, ofi_req->length = length; ofi_req->status.MPI_ERROR = OMPI_SUCCESS; + if (ompi_mtl_ofi.fi_cq_data) { + match_bits = mtl_ofi_create_send_tag_CQD(comm->c_contextid, tag); + src_addr = endpoint->peer_fiaddr; + } else { + match_bits = mtl_ofi_create_send_tag(comm->c_contextid, + comm->c_my_rank, tag); + /* src_addr is ignored when FI_DIRECTED_RECV is not supported */ + } + if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_SYNCHRONOUS == mode)) { ack_req = malloc(sizeof(ompi_mtl_ofi_request_t)); assert(ack_req); @@ -263,14 +273,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, ack_req->error_callback = ompi_mtl_ofi_send_ack_error_callback; ofi_req->completion_count = 2; - MTL_OFI_SET_SEND_BITS(match_bits, comm->c_contextid, - comm->c_my_rank, tag, MTL_OFI_SYNC_SEND); + + MTL_OFI_SET_SYNC_SEND(match_bits); + MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep, NULL, 0, NULL, - endpoint->peer_fiaddr, - match_bits | MTL_OFI_SYNC_SEND_ACK, + src_addr, + match_bits | ompi_mtl_ofi.sync_send_ack, 0, /* Exact match, no ignore bits */ (void *) &ack_req->ctx)); if (OPAL_UNLIKELY(0 > ret)) { @@ -282,20 +293,30 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, } } else { ofi_req->completion_count = 1; - MTL_OFI_SET_SEND_BITS(match_bits, comm->c_contextid, - comm->c_my_rank, tag, 0); } if (ompi_mtl_ofi.max_inject_size >= length) { - MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ep, + if (ompi_mtl_ofi.fi_cq_data) { + MTL_OFI_RETRY_UNTIL_DONE(fi_tinjectdata(ompi_mtl_ofi.ep, + start, + length, + comm->c_my_rank, + endpoint->peer_fiaddr, + match_bits)); 
+ } else { + MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ep, start, length, endpoint->peer_fiaddr, match_bits)); + } + if (OPAL_UNLIKELY(0 > ret)) { + const char *fi_api = ompi_mtl_ofi.fi_cq_data ? "fi_tinjectdata" : "fi_tinject"; /* name of the libfabric call issued above, for the log */ opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_tinject failed: %s(%zd)", - __FILE__, __LINE__, fi_strerror(-ret), ret); + "%s:%d: %s failed: %s(%zd)", + __FILE__, __LINE__,fi_api, fi_strerror(-ret), ret); + if (ack_req) { fi_cancel((fid_t)ompi_mtl_ofi.ep, &ack_req->ctx); free(ack_req); @@ -305,17 +326,29 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl, ofi_req->event_callback(NULL,ofi_req); } else { - MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep, + if (ompi_mtl_ofi.fi_cq_data) { + MTL_OFI_RETRY_UNTIL_DONE(fi_tsenddata(ompi_mtl_ofi.ep, start, length, NULL, + comm->c_my_rank, endpoint->peer_fiaddr, match_bits, (void *) &ofi_req->ctx)); + } else { + MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep, + start, + length, + NULL, + endpoint->peer_fiaddr, + match_bits, + (void *) &ofi_req->ctx)); + } if (OPAL_UNLIKELY(0 > ret)) { + const char *fi_api = ompi_mtl_ofi.fi_cq_data ? 
"fi_tsenddata" : "fi_tsend"; /* name of the libfabric call issued above, for the log */ opal_output_verbose(1, ompi_mtl_base_framework.framework_output, - "%s:%d: fi_tsend failed: %s(%zd)", - __FILE__, __LINE__, fi_strerror(-ret), ret); + "%s:%d: %s failed: %s(%zd)", + __FILE__, __LINE__,fi_api, fi_strerror(-ret), ret); return ompi_mtl_ofi_get_error(ret); } } @@ -415,7 +448,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, ssize_t ret; ompi_proc_t *ompi_proc = NULL; mca_mtl_ofi_endpoint_t *endpoint = NULL; - int src; + int src = mtl_ofi_get_source(wc); ompi_status_public_t *status = NULL; assert(ofi_req->super.ompi_req); @@ -427,7 +460,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, */ ofi_req->req_started = true; - status->MPI_SOURCE = MTL_OFI_GET_SOURCE(wc->tag); + status->MPI_SOURCE = src; status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag); status->_ucount = wc->len; @@ -474,7 +507,6 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, * we need to extract the source's actual address. */ if (ompi_mtl_ofi.any_addr == ofi_req->remote_addr) { - src = MTL_OFI_GET_SOURCE(wc->tag); ompi_proc = ompi_comm_peer_lookup(ofi_req->comm, src); endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc); ofi_req->remote_addr = endpoint->peer_fiaddr; @@ -484,7 +516,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc, 0, NULL, ofi_req->remote_addr, - wc->tag | MTL_OFI_SYNC_SEND_ACK, + wc->tag | ompi_mtl_ofi.sync_send_ack, (void *) &ofi_req->ctx)); if (OPAL_UNLIKELY(0 > ret)) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, @@ -510,7 +542,7 @@ ompi_mtl_ofi_recv_error_callback(struct fi_cq_err_entry *error, assert(ofi_req->super.ompi_req); status = &ofi_req->super.ompi_req->req_status; status->MPI_TAG = MTL_OFI_GET_TAG(ofi_req->match_bits); - status->MPI_SOURCE = MTL_OFI_GET_SOURCE(ofi_req->match_bits); + status->MPI_SOURCE = mtl_ofi_get_source((struct fi_cq_tagged_entry *) error); switch (error->err) { case FI_ETRUNC: @@ -538,7 
+570,7 @@ ompi_mtl_ofi_irecv(struct 
mca_mtl_base_module_t *mtl, int ompi_ret = OMPI_SUCCESS; ssize_t ret; uint64_t match_bits, mask_bits; - fi_addr_t remote_addr; + fi_addr_t remote_addr = ompi_mtl_ofi.any_addr; ompi_proc_t *ompi_proc = NULL; mca_mtl_ofi_endpoint_t *endpoint = NULL; ompi_mtl_ofi_request_t *ofi_req = (ompi_mtl_ofi_request_t*) mtl_request; @@ -546,16 +578,22 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl, size_t length; bool free_after; - if (MPI_ANY_SOURCE != src) { - ompi_proc = ompi_comm_peer_lookup(comm, src); - endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); - remote_addr = endpoint->peer_fiaddr; + + if (ompi_mtl_ofi.fi_cq_data) { + if (MPI_ANY_SOURCE != src) { + ompi_proc = ompi_comm_peer_lookup(comm, src); + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); + remote_addr = endpoint->peer_fiaddr; + } + + mtl_ofi_create_recv_tag_CQD(&match_bits, &mask_bits, comm->c_contextid, + tag); } else { - remote_addr = ompi_mtl_ofi.any_addr; + mtl_ofi_create_recv_tag(&match_bits, &mask_bits, comm->c_contextid, src, + tag); + /* src_addr is ignored when FI_DIRECTED_RECV is not used */ } - MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag); - ompi_ret = ompi_mtl_datatype_recv_buf(convertor, &start, &length, @@ -606,7 +644,7 @@ ompi_mtl_ofi_mrecv_callback(struct fi_cq_tagged_entry *wc, { struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req; ompi_status_public_t *status = &mrecv_req->ompi_req->req_status; - status->MPI_SOURCE = MTL_OFI_GET_SOURCE(wc->tag); + status->MPI_SOURCE = mtl_ofi_get_source(wc); status->MPI_TAG = MTL_OFI_GET_TAG(wc->tag); status->MPI_ERROR = MPI_SUCCESS; status->_ucount = wc->len; @@ -628,7 +666,7 @@ ompi_mtl_ofi_mrecv_error_callback(struct fi_cq_err_entry *error, struct mca_mtl_request_t *mrecv_req = ofi_req->mrecv_req; ompi_status_public_t *status = &mrecv_req->ompi_req->req_status; status->MPI_TAG = MTL_OFI_GET_TAG(ofi_req->match_bits); - status->MPI_SOURCE = MTL_OFI_GET_SOURCE(ofi_req->match_bits); + status->MPI_SOURCE = 
mtl_ofi_get_source((struct fi_cq_tagged_entry *) error); switch (error->err) { case FI_ETRUNC: @@ -716,7 +754,7 @@ ompi_mtl_ofi_probe_callback(struct fi_cq_tagged_entry *wc, { ofi_req->match_state = 1; ofi_req->match_bits = wc->tag; - ofi_req->status.MPI_SOURCE = MTL_OFI_GET_SOURCE(wc->tag); + ofi_req->status.MPI_SOURCE = mtl_ofi_get_source(wc); ofi_req->status.MPI_TAG = MTL_OFI_GET_TAG(wc->tag); ofi_req->status.MPI_ERROR = MPI_SUCCESS; ofi_req->status._ucount = wc->len; @@ -749,22 +787,28 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl, struct ompi_mtl_ofi_request_t ofi_req; ompi_proc_t *ompi_proc = NULL; mca_mtl_ofi_endpoint_t *endpoint = NULL; - fi_addr_t remote_proc = 0; + fi_addr_t remote_proc = ompi_mtl_ofi.any_addr; uint64_t match_bits, mask_bits; ssize_t ret; struct fi_msg_tagged msg; uint64_t msgflags = FI_PEEK; - /** - * If the source is known, use its peer_fiaddr. - */ - if (MPI_ANY_SOURCE != src) { - ompi_proc = ompi_comm_peer_lookup( comm, src ); - endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); - remote_proc = endpoint->peer_fiaddr; - } + if (ompi_mtl_ofi.fi_cq_data) { + /* If the source is known, use its peer_fiaddr. 
*/ + if (MPI_ANY_SOURCE != src) { + ompi_proc = ompi_comm_peer_lookup( comm, src ); + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); + remote_proc = endpoint->peer_fiaddr; + } - MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag); + mtl_ofi_create_recv_tag_CQD(&match_bits, &mask_bits, comm->c_contextid, + tag); + } + else { + mtl_ofi_create_recv_tag(&match_bits, &mask_bits, comm->c_contextid, src, + tag); + /* src_addr is ignored when FI_DIRECTED_RECV is not used */ + } /** * fi_trecvmsg with FI_PEEK: @@ -829,7 +873,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, struct ompi_mtl_ofi_request_t *ofi_req; ompi_proc_t *ompi_proc = NULL; mca_mtl_ofi_endpoint_t *endpoint = NULL; - fi_addr_t remote_proc = 0; + fi_addr_t remote_proc = ompi_mtl_ofi.any_addr; uint64_t match_bits, mask_bits; ssize_t ret; struct fi_msg_tagged msg; @@ -843,13 +887,22 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl, /** * If the source is known, use its peer_fiaddr. 
*/ - if (MPI_ANY_SOURCE != src) { - ompi_proc = ompi_comm_peer_lookup( comm, src ); - endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); - remote_proc = endpoint->peer_fiaddr; - } - MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, comm->c_contextid, src, tag); + if (ompi_mtl_ofi.fi_cq_data) { + if (MPI_ANY_SOURCE != src) { + ompi_proc = ompi_comm_peer_lookup( comm, src ); + endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc); + remote_proc = endpoint->peer_fiaddr; + } + + mtl_ofi_create_recv_tag_CQD(&match_bits, &mask_bits, comm->c_contextid, + tag); + } + else { + /* src_addr is ignored when FI_DIRECTED_RECV is not used */ + mtl_ofi_create_recv_tag(&match_bits, &mask_bits, comm->c_contextid, src, + tag); + } /** * fi_trecvmsg with FI_PEEK and FI_CLAIM: diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index 662fb38e796..bcc02d0d1f9 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2013-2017 Intel, Inc. All rights reserved + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved * * Copyright (c) 2014-2017 Cisco Systems, Inc. All rights reserved * Copyright (c) 2015-2016 Los Alamos National Security, LLC. 
All rights @@ -31,6 +31,7 @@ static char *prov_exclude; static int control_progress; static int data_progress; static int av_type; +static int ofi_tag_mode; /* * Enumerators @@ -68,6 +69,21 @@ mca_base_var_enum_value_t av_table_type[] = { {0, NULL} }; +enum { + MTL_OFI_TAG_AUTO=1, + MTL_OFI_TAG_1, + MTL_OFI_TAG_2, + MTL_OFI_TAG_FULL, +}; + +mca_base_var_enum_value_t ofi_tag_mode_type[] = { + {MTL_OFI_TAG_AUTO, "auto"}, + {MTL_OFI_TAG_1, "ofi_tag_1"}, + {MTL_OFI_TAG_2, "ofi_tag_2"}, + {MTL_OFI_TAG_FULL, "ofi_tag_full"}, + {0, NULL} +}; + mca_mtl_ofi_component_t mca_mtl_ofi_component = { { @@ -136,7 +152,37 @@ ompi_mtl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mtl_ofi.ofi_progress_event_count); - free(desc); + free(desc); + + ret = mca_base_var_enum_create ("ofi_tag_mode_type", ofi_tag_mode_type , &new_enum); + if (OPAL_SUCCESS != ret) { + return ret; + } + + ofi_tag_mode = MTL_OFI_TAG_AUTO; + asprintf(&desc, "Mode specifying how many bits to use for various MPI values in OFI/Libfabric" + " communications. Some Libfabric provider network types can support most of Open MPI" + " needs; others can only supply a limited number of bits, which then must be split" + " across the MPI communicator ID, MPI source rank, and MPI tag. Three different" + " splitting schemes are available: ofi_tag_full (%d bits for the communicator, %d bits" + " for the source rank, and %d bits for the tag), ofi_tag_1 (%d bits for the communicator" + ", %d bits source rank, %d bits tag), ofi_tag_2 (%d bits for the communicator" + ", %d bits source rank, %d bits tag). 
By default, this MCA variable is set to \"auto\"," + " which will first try to use ofi_tag_full, and if that fails, fall back to ofi_tag_1.", + MTL_OFI_CID_BIT_COUNT_DATA, 32, MTL_OFI_TAG_BIT_COUNT_DATA, + MTL_OFI_CID_BIT_COUNT_1, MTL_OFI_SOURCE_BIT_COUNT_1, MTL_OFI_TAG_BIT_COUNT_1, + MTL_OFI_CID_BIT_COUNT_2, MTL_OFI_SOURCE_BIT_COUNT_2, MTL_OFI_TAG_BIT_COUNT_2); + + mca_base_component_var_register (&mca_mtl_ofi_component.super.mtl_version, + "tag_mode", + desc, + MCA_BASE_VAR_TYPE_INT, new_enum, 0, 0, + OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, + &ofi_tag_mode); + + free(desc); + OBJ_RELEASE(new_enum); ret = mca_base_var_enum_create ("control_prog_type", control_prog_type, &new_enum); if (OPAL_SUCCESS != ret) { @@ -304,13 +350,96 @@ select_ofi_provider(struct fi_info *providers) return prov; } +/* Check if FI_REMOTE_CQ_DATA is supported, if so send the source rank there + * FI_DIRECTED_RECV is also needed so receives can discrimate the source + */ +static int +ompi_mtl_ofi_check_fi_remote_cq_data(int fi_version, + struct fi_info *hints, + struct fi_info *provider, + struct fi_info **prov_cq_data) +{ + int ret; + char *provider_name; + struct fi_info *hints_dup; + hints_dup = fi_dupinfo(hints); + + provider_name = strdup(provider->fabric_attr->prov_name); + hints_dup->fabric_attr->prov_name = provider_name; + hints_dup->caps |= FI_TAGGED | FI_DIRECTED_RECV; + /* Ask for the size that OMPI uses for the source rank number */ + hints_dup->domain_attr->cq_data_size = sizeof(int); + ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, prov_cq_data); + + if ((0 != ret) && (-FI_ENODATA != ret)) { + opal_show_help("help-mtl-ofi.txt", "OFI call fail", true, + "fi_getinfo", + ompi_process_info.nodename, __FILE__, __LINE__, + fi_strerror(-ret), -ret); + return ret; + } else if (-FI_ENODATA == ret) { + /* The provider does not support FI_REMOTE_CQ_DATA */ + prov_cq_data = NULL; + } + + fi_freeinfo(hints_dup); + return OMPI_SUCCESS; +} + +static void 
+ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode) { + switch (ofi_tag_mode) { + case MTL_OFI_TAG_1: + ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_1 ) - 1); + ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_1 - 1)) - 1); + + ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_1; + ompi_mtl_ofi.num_bits_source_rank = MTL_OFI_SOURCE_BIT_COUNT_1; + ompi_mtl_ofi.source_rank_mask = MTL_OFI_SOURCE_MASK_1; + + ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_1; + ompi_mtl_ofi.num_bits_mpi_tag = MTL_OFI_TAG_BIT_COUNT_1; + + ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_1; + ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_1; + ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_1; + break; + case MTL_OFI_TAG_2: + ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_2 ) - 1); + ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_2 - 1)) - 1); + + ompi_mtl_ofi.source_rank_tag_mask = MTL_OFI_SOURCE_TAG_MASK_2; + ompi_mtl_ofi.num_bits_source_rank = MTL_OFI_SOURCE_BIT_COUNT_2; + ompi_mtl_ofi.source_rank_mask = MTL_OFI_SOURCE_MASK_2; + + ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_2; + ompi_mtl_ofi.num_bits_mpi_tag = MTL_OFI_TAG_BIT_COUNT_2; + + ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_2; + ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_2; + ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_2; + break; + default: /* use FI_REMOTE_CQ_DATA */ + ompi_mtl_ofi.base.mtl_max_contextid = (int)((1ULL << MTL_OFI_CID_BIT_COUNT_DATA ) - 1); + ompi_mtl_ofi.base.mtl_max_tag = (int)((1ULL << (MTL_OFI_TAG_BIT_COUNT_DATA - 1)) - 1); + + ompi_mtl_ofi.mpi_tag_mask = MTL_OFI_TAG_MASK_DATA; + + ompi_mtl_ofi.sync_send = MTL_OFI_SYNC_SEND_DATA; + ompi_mtl_ofi.sync_send_ack = MTL_OFI_SYNC_SEND_ACK_DATA; + ompi_mtl_ofi.sync_proto_mask = MTL_OFI_PROTO_MASK_DATA; + } +} + static mca_mtl_base_module_t* ompi_mtl_ofi_component_init(bool enable_progress_threads, bool enable_mpi_threads) { int ret, fi_version; 
struct fi_info *hints; - struct fi_info *providers = NULL, *prov = NULL; + struct fi_info *providers = NULL; + struct fi_info *prov = NULL; + struct fi_info *prov_cq_data = NULL; struct fi_cq_attr cq_attr = {0}; struct fi_av_attr av_attr = {0}; char ep_name[FI_NAME_MAX] = {0}; @@ -411,6 +540,39 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, goto error; } + /** + * Select the format of the OFI tag + */ + if ((MTL_OFI_TAG_AUTO == ofi_tag_mode) || + (MTL_OFI_TAG_FULL == ofi_tag_mode)) { + ret = ompi_mtl_ofi_check_fi_remote_cq_data(fi_version, + hints, prov, + &prov_cq_data); + if (OMPI_SUCCESS != ret) { + goto error; + } else if (NULL == prov_cq_data) { + /* No support for FI_REMTOTE_CQ_DATA */ + fi_freeinfo(prov_cq_data); + ompi_mtl_ofi.fi_cq_data = false; + if (MTL_OFI_TAG_AUTO == ofi_tag_mode) { + /* Fallback to MTL_OFI_TAG_1 */ + ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1); + } else { /* MTL_OFI_TAG_FULL */ + opal_output_verbose(1, ompi_mtl_base_framework.framework_output, + "%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n", + __FILE__, __LINE__, prov->fabric_attr->prov_name); + goto error; + } + } else { + /* Use FI_REMTOTE_CQ_DATA */ + ompi_mtl_ofi.fi_cq_data = true; + prov = prov_cq_data; + ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_FULL); + } + } else { /* MTL_OFI_TAG_1 or MTL_OFI_TAG_2 */ + ompi_mtl_ofi.fi_cq_data = false; + ompi_mtl_ofi_define_tag_mode(ofi_tag_mode); + } /** * Open fabric @@ -503,7 +665,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, * Allocate memory for storing the CQ events read in OFI progress. 
*/ ompi_mtl_ofi.progress_entries = calloc(ompi_mtl_ofi.ofi_progress_event_count, sizeof(struct fi_cq_tagged_entry)); - if (OPAL_UNLIKELY(!ompi_mtl_ofi.progress_entries)) { + if (NULL == ompi_mtl_ofi.progress_entries) { opal_output_verbose(1, ompi_mtl_base_framework.framework_output, "%s:%d: alloc of CQ event storage failed: %s\n", __FILE__, __LINE__, strerror(errno)); @@ -614,6 +776,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, if (providers) { (void) fi_freeinfo(providers); } + if (prov_cq_data) { + (void) fi_freeinfo(prov_cq_data); + } if (hints) { (void) fi_freeinfo(hints); } diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index 0b6a1fcc715..abc587b2697 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2018 Intel, Inc. All rights reserved * * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -55,6 +55,21 @@ typedef struct mca_mtl_ofi_module_t { /** CQ event storage */ struct fi_cq_tagged_entry *progress_entries; + /** Use FI_REMOTE_CQ_DATA*/ + bool fi_cq_data; + + /** Info used to create the OFI tag **/ + unsigned long long source_rank_tag_mask; + int num_bits_source_rank; + unsigned long long source_rank_mask; + unsigned long long mpi_tag_mask; + int num_bits_mpi_tag; + + /** Synchronous protocol tag bits */ + unsigned long long sync_send; + unsigned long long sync_send_ack; + unsigned long long sync_proto_mask; + } mca_mtl_ofi_module_t; extern mca_mtl_ofi_module_t ompi_mtl_ofi; @@ -64,75 +79,165 @@ typedef struct mca_mtl_ofi_component_t { mca_mtl_base_component_2_0_0_t super; } mca_mtl_ofi_component_t; - -/* match/ignore bit manipulation - * - * 0 123 4567 01234567 01234567 01234567 01234567 01234567 01234567 01234567 - * | | | | - * | | context id | source | message tag - * ^| ^ | | | - * | | - * | +- protocol - * +---- ACK flag +/*OFI TAG: + * Define 3 different OFI tag distributions: + * 1) Support FI_REMOTE_CQ_DATA: No need for source rank in the tag + * 2) ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported + * 3) ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported + * with more bits for the communicator ID. + * More details of the tags are in the README file (mtl_ofi_tag_mode). 
+*/ + +/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum) + * 01234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567 + * | | + * context_id |prot| message tag */ +#define MTL_OFI_PROTO_BIT_COUNT (4) + +#define MTL_OFI_CID_BIT_COUNT_DATA (28) +#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL) +#define MTL_OFI_TAG_BIT_COUNT_DATA (32) +#define MTL_OFI_PROTO_MASK_DATA (0x0000000F00000000ULL) +#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL) +#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000900000000ULL) + +/* Send tag with CQ_DATA */ +__opal_attribute_always_inline__ static inline uint64_t +mtl_ofi_create_send_tag_CQD(int comm_id, int tag) +{ + uint64_t match_bits = comm_id; + match_bits = (match_bits << (MTL_OFI_TAG_BIT_COUNT_DATA + + MTL_OFI_PROTO_BIT_COUNT)); + match_bits |= (tag & MTL_OFI_TAG_MASK_DATA); + return match_bits; +} + +/* Receive tag with CQ_DATA */ +__opal_attribute_always_inline__ static inline void +mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits, + int comm_id, int tag) +{ + *mask_bits = ompi_mtl_ofi.sync_proto_mask; + *match_bits = (uint64_t) comm_id; + *match_bits = (*match_bits << (MTL_OFI_PROTO_BIT_COUNT + + MTL_OFI_TAG_BIT_COUNT_DATA)); + if (MPI_ANY_TAG == tag) { + /* Special negative tags are used for collective operations. + * MPI_ANY_TAG should not match these special tags. 
+ * See ompi/mca/coll/base/coll_tags.h + */ + *mask_bits |= (ompi_mtl_ofi.mpi_tag_mask>>1); + } else { + *match_bits |= (ompi_mtl_ofi.mpi_tag_mask & tag); + } +} + +/* +* ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported +* +* 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567 01234567 +* | | | +* Comm id | source |prot| message tag +*/ + +#define MTL_OFI_CID_BIT_COUNT_1 (12) +#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFF000000000ULL) +#define MTL_OFI_SOURCE_BIT_COUNT_1 (16) +#define MTL_OFI_SOURCE_MASK_1 (0x000000000000FFFFULL) +#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL) +#define MTL_OFI_TAG_BIT_COUNT_1 (32) +#define MTL_OFI_PROTO_MASK_1 (0x0000000F00000000ULL) +#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL) +#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000900000000ULL) -#define MTL_OFI_PROTOCOL_HEADER_MASK (0xF000000000000000ULL) -#define MTL_OFI_PROTOCOL_MASK (0x7000000000000000ULL) -#define MTL_OFI_CONTEXT_MASK (0x0FFF000000000000ULL) -#define MTL_OFI_SOURCE_MASK (0x0000FFFF00000000ULL) -#define MTL_OFI_TAG_MASK (0x00000000FFFFFFFFULL) - -#define MTL_OFI_SYNC_SEND (0x1000000000000000ULL) -#define MTL_OFI_SYNC_SEND_ACK (0x9000000000000000ULL) - -/* send posting */ -#define MTL_OFI_SET_SEND_BITS(match_bits, contextid, source, tag, type) \ - { \ - match_bits = contextid; \ - match_bits = (match_bits << 16); \ - match_bits |= (uint64_t)source; \ - match_bits = (match_bits << 32); \ - match_bits |= (MTL_OFI_TAG_MASK & tag) | type; \ +/* +* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported +* +* 01234567 01234567 01234567 01234567 01234567 0123 4567 01234567 01234567 +* | | | +* Comm id | source |prot| message tag +*/ + +#define MTL_OFI_CID_BIT_COUNT_2 (24) +#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFF000000ULL) +#define MTL_OFI_SOURCE_BIT_COUNT_2 (16) +#define MTL_OFI_SOURCE_MASK_2 (0x000000000000FFFFULL) +#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL) +#define MTL_OFI_TAG_BIT_COUNT_2 (20) +#define 
MTL_OFI_PROTO_MASK_2 (0x0000000000F00000ULL) +#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL) +#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000900000ULL) + +/* Send tag */ +__opal_attribute_always_inline__ static inline uint64_t +mtl_ofi_create_send_tag(int comm_id, int source, int tag) +{ + uint64_t match_bits = comm_id; + match_bits = (match_bits << ompi_mtl_ofi.num_bits_source_rank); + match_bits |= (uint64_t)(source & ompi_mtl_ofi.source_rank_mask); + match_bits = (match_bits << (ompi_mtl_ofi.num_bits_mpi_tag + + MTL_OFI_PROTO_BIT_COUNT)); + match_bits |= (tag & ompi_mtl_ofi.mpi_tag_mask); + return match_bits; +} + +/* Receive tag*/ +__opal_attribute_always_inline__ static inline void +mtl_ofi_create_recv_tag(uint64_t *match_bits, uint64_t *mask_bits, + int comm_id, int source, int tag) +{ + *mask_bits = ompi_mtl_ofi.sync_proto_mask; + *match_bits = comm_id; + *match_bits = (*match_bits << ompi_mtl_ofi.num_bits_source_rank); + + if (MPI_ANY_SOURCE == source) { + *match_bits = (*match_bits << (ompi_mtl_ofi.num_bits_mpi_tag + + MTL_OFI_PROTO_BIT_COUNT)); + *mask_bits |= ompi_mtl_ofi.source_rank_tag_mask; + } else { + *match_bits |= (uint64_t)(source & ompi_mtl_ofi.source_rank_mask); + *match_bits = (*match_bits << (ompi_mtl_ofi.num_bits_mpi_tag + + MTL_OFI_PROTO_BIT_COUNT)); } -/* receive posting */ -/* Special tags are used for collective operations. - * MPI_ANY_TAG should not match these special tags. 
- * See ompi/mca/coll/base/coll_tags.h - */ -#define MTL_OFI_SET_RECV_BITS(match_bits, mask_bits, contextid, source, tag) \ - { \ - match_bits = 0; \ - mask_bits = MTL_OFI_PROTOCOL_MASK; \ - \ - match_bits = contextid; \ - match_bits = (match_bits << 16); \ - \ - if (MPI_ANY_SOURCE == source) { \ - match_bits = (match_bits << 32); \ - mask_bits |= MTL_OFI_SOURCE_MASK; \ - } else { \ - match_bits |= (uint64_t)source; \ - match_bits = (match_bits << 32); \ - } \ - \ - if (MPI_ANY_TAG == tag) { \ - mask_bits |= 0x000000007FFFFFFFULL; \ - } else { \ - match_bits |= (MTL_OFI_TAG_MASK & tag); \ - } \ + if (MPI_ANY_TAG == tag) { + /* Special negative tags are used for collective operations. + * MPI_ANY_TAG should not match these special tags. + * See ompi/mca/coll/base/coll_tags.h + */ + *mask_bits |= (ompi_mtl_ofi.mpi_tag_mask>>1); + } else { + *match_bits |= (ompi_mtl_ofi.mpi_tag_mask & tag); } +} + +#define MTL_OFI_SET_SYNC_SEND(match_bits) \ + match_bits |= ompi_mtl_ofi.sync_send #define MTL_OFI_IS_SYNC_SEND(match_bits) \ - (MTL_OFI_SYNC_SEND == (MTL_OFI_PROTOCOL_HEADER_MASK & match_bits)) -#define MTL_OFI_IS_SYNC_SEND_ACK(match_bits) \ - (MTL_OFI_SYNC_SEND_ACK == (MTL_OFI_PROTOCOL_HEADER_MASK & match_bits)) + (ompi_mtl_ofi.sync_send == (ompi_mtl_ofi.sync_proto_mask & match_bits)) + +#define MTL_OFI_IS_SYNC_SEND_ACK(match_bits) \ + (ompi_mtl_ofi.sync_send_ack == (ompi_mtl_ofi.sync_proto_mask & match_bits)) #define MTL_OFI_GET_TAG(match_bits) \ - ((int)(match_bits & MTL_OFI_TAG_MASK)) -#define MTL_OFI_GET_SOURCE(match_bits) \ - ((int)((match_bits & MTL_OFI_SOURCE_MASK) >> 32)) + ((int)(match_bits & ompi_mtl_ofi.mpi_tag_mask)) + +__opal_attribute_always_inline__ static inline int +mtl_ofi_get_source(struct fi_cq_tagged_entry *wc) +{ + int src; + if (ompi_mtl_ofi.fi_cq_data) { + src = (int) wc->data; + } + else { + src = (int)((wc->tag >> (MTL_OFI_PROTO_BIT_COUNT + + ompi_mtl_ofi.num_bits_mpi_tag)) & ompi_mtl_ofi.source_rank_mask); + } + return src; +} END_C_DECLS #endif 
/* MTL_OFI_TYPES_H_HAS_BEEN_INCLUDED */