From 1099d05dddc1b06707a434a47892de0548473709 Mon Sep 17 00:00:00 2001 From: Franz Poeschel Date: Thu, 30 Nov 2023 15:35:52 +0100 Subject: [PATCH 01/50] First attempt at shm support --- source/adios2/toolkit/sst/dp/rdma_dp.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 311e79a7f5..3c989f0fe0 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -209,6 +209,13 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->mode = FI_CONTEXT | FI_LOCAL_MR | FI_CONTEXT2 | FI_MSG_PREFIX | FI_ASYNC_IOV | FI_RX_CQ_DATA; hints->ep_attr->type = FI_EP_RDM; + { + char const *prov_name = "shm"; + size_t len = strlen(prov_name) + 1; + char *construct_prov_name = malloc(len); + memcpy(construct_prov_name, prov_name, len); + hints->fabric_attr->prov_name = construct_prov_name; + } uint32_t fi_version; #ifdef SST_HAVE_CRAY_CXI @@ -246,7 +253,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->domain_attr->data_progress = FI_PROGRESS_AUTO; } #else - fi_version = FI_VERSION(1, 5); + fi_version = FI_VERSION(1, 18); // Alternatively, one could set mr_mode to // FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL @@ -256,9 +263,9 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, // The RDMA DP is able to deal with this appropriately, and does so right // before calling fi_fabric() further below in this function. // The main reason for keeping FI_MR_BASIC here is backward compatibility. - hints->domain_attr->mr_mode = FI_MR_BASIC; + hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; #endif /* @@ -294,6 +301,12 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, char *prov_name = info->fabric_attr->prov_name; char *domain_name = info->domain_attr->name; + if (info->tx_attr->inject_size > 0) + { + info = info->next; + continue; + } + if (ifname && strcmp(ifname, domain_name) == 0) { Svcs->verbose(CP_Stream, DPTraceVerbose, "using interface set by FABRIC_IFACE.\n"); @@ -460,7 +473,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, } av_attr.type = FI_AV_MAP; - av_attr.count = DP_AV_DEF_SIZE; + // av_attr.count = DP_AV_DEF_SIZE; av_attr.ep_per_node = 0; result = fi_av_open(fabric->domain, &av_attr, &fabric->av, fabric->ctx); if (result != FI_SUCCESS) @@ -1342,8 +1355,9 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S ReaderRollHandle = &ContactInfo->ReaderRollHandle; ReaderRollHandle->Block = calloc(readerCohortSize, sizeof(struct _RdmaBuffer)); + static uint64_t key = 12345; sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, - readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_WRITE, 0, 0, 0, + readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_WRITE, 0, ++key, 0, &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, Fabric->info->domain_attr->mr_mode); ReaderRollHandle->Key = fi_mr_key(WSR_Stream->rrmr); @@ -2081,6 +2095,7 @@ static struct _CP_DP_Interface RdmaDPInterface = {0}; */ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params) { + return 100; struct 
fi_info *hints, *info, *originfo; char const *ifname; char *forkunsafe; From 86809ee94eb25a287625a8bb715af33afc5efc8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 10:29:55 +0100 Subject: [PATCH 02/50] Keep mr_mode --- source/adios2/toolkit/sst/dp/rdma_dp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 3c989f0fe0..841056ee9d 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -415,9 +415,9 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, */ if (info->domain_attr->mr_mode != FI_MR_BASIC) { - info->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | - (FI_MR_ENDPOINT & info->domain_attr->mr_mode) | - (FI_MR_VIRT_ADDR & info->domain_attr->mr_mode); + // info->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | + // (FI_MR_ENDPOINT & info->domain_attr->mr_mode) | + // (FI_MR_VIRT_ADDR & info->domain_attr->mr_mode); fabric->mr_virt_addr = info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? 1 : 0; } else @@ -438,7 +438,6 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, Svcs->verbose(CP_Stream, DPCriticalVerbose, "copying the fabric info failed.\n"); return; } - Svcs->verbose(CP_Stream, DPTraceVerbose, "Fabric parameters to use at fabric initialization: %s\n", fi_tostr(fabric->info, FI_TYPE_INFO)); From 9c4e6e46f31fec7d6f50b49f4cd3edab619b3b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 11:44:27 +0100 Subject: [PATCH 03/50] Hardcode address length lol --- source/adios2/toolkit/sst/dp/rdma_dp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 841056ee9d..8227d16ec2 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1079,7 +1079,7 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea return NULL; } - ContactInfo->Length = Fabric->info->src_addrlen; + ContactInfo->Length = 24; ContactInfo->Address = malloc(ContactInfo->Length); if (guard_fi_return( fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length), Svcs, @@ -1342,7 +1342,7 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S ContactInfo = calloc(1, sizeof(struct _RdmaWriterContactInfo)); ContactInfo->WS_Stream = WSR_Stream; - ContactInfo->Length = Fabric->info->src_addrlen; + ContactInfo->Length = 24; ContactInfo->Address = malloc(ContactInfo->Length); if (guard_fi_return( fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length), Svcs, From 7aafa456589451b1747f54b8df06dff9d56723fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 11:44:38 +0100 Subject: [PATCH 04/50] Tracing output --- source/adios2/toolkit/sst/dp/rdma_dp.c | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 8227d16ec2..f288b76ff1 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -113,6 +113,7 @@ int sst_fi_mr_reg( */ int guard_fi_return(int code, CP_Services Svcs, CManager cm, char const *msg) { + printf("[RDMA CALL guard_fi_return]\n"); if (code != FI_SUCCESS) { Svcs->verbose(cm, 
DPCriticalVerbose, "%s: %s (%lu)\n", msg, fi_strerror(code), code); @@ -198,6 +199,7 @@ static char const *get_preferred_domain(struct _SstParams *Params) static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, CP_Services Svcs, void *CP_Stream, char const *ifname) { + printf("[RDMA CALL init_fabric]\n"); struct fi_info *hints, *info, *originfo, *useinfo; struct fi_av_attr av_attr = {FI_AV_UNSPEC}; struct fi_cq_attr cq_attr = {0}; @@ -530,6 +532,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_Stream) { + printf("[RDMA CALL fini_fabric]\n"); int res; @@ -762,6 +765,7 @@ typedef struct _RdmaWriterContactInfo static TimestepList GetStep(Rdma_WS_Stream Stream, long Timestep) { + printf("[RDMA CALL GetStep]\n"); TimestepList Step; pthread_mutex_lock(&ts_mutex); @@ -779,6 +783,7 @@ static TimestepList GetStep(Rdma_WS_Stream Stream, long Timestep) static int get_cxi_auth_key_from_env(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, struct cxi_auth_key *key, char **used_device) { + printf("[RDMA CALL get_cxi_auth_key_from_env]\n"); int vni, first_vni, second_vni, svc_id; // Just some safety against faulty strings in string processing. @@ -950,6 +955,7 @@ static int get_cxi_auth_key_from_env(CP_Services Svcs, void *CP_Stream, struct _ static int get_cxi_auth_key_from_writer(struct cxi_auth_key *key, attr_list WriterContact) { + printf("[RDMA CALL get_cxi_auth_key_from_writer]\n"); long vni; if (!get_long_attr(WriterContact, attr_atom_from_string("vni"), &vni)) { @@ -964,6 +970,7 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea struct _SstParams *Params, attr_list WriterContact, SstStats Stats) { + printf("[RDMA CALL RdmaInitReader]\n"); Rdma_RS_Stream Stream = malloc(sizeof(struct _Rdma_RS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); RdmaReaderContactInfo ContactInfo = malloc(sizeof(struct _RdmaReaderContactInfo)); @@ -1104,6 +1111,7 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea static void RdmaReadPatternLocked(CP_Services Svcs, DP_WSR_Stream WSRStream_v, long EffectiveTimestep) { + printf("[RDMA CALL RdmaReadPatternLocked]\n"); Rdma_WSR_Stream WSR_Stream = (Rdma_WSR_Stream)WSRStream_v; Rdma_WS_Stream WS_Stream = WSR_Stream->WS_Stream; @@ -1127,6 +1135,7 @@ static void RdmaReadPatternLocked(CP_Services Svcs, DP_WSR_Stream WSRStream_v, static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long EffectiveTimestep) { + printf("[RDMA CALL RdmaWritePatternLocked]\n"); Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; if (Stream->PreloadAvail) @@ -1149,6 +1158,7 @@ static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long static DP_WS_Stream RdmaInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { + printf("[RDMA CALL RdmaInitWriter]\n"); Rdma_WS_Stream Stream = malloc(sizeof(struct _Rdma_WS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); char *PreloadEnv; @@ -1297,6 +1307,7 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S void **providedReaderInfo_v, void **WriterContactInfoPtr) { + printf("[RDMA CALL RdmaInitWriterPerReader]\n"); Rdma_WS_Stream WS_Stream = (Rdma_WS_Stream)WS_Stream_v; Rdma_WSR_Stream WSR_Stream = malloc(sizeof(*WSR_Stream)); FabricState Fabric = WS_Stream->Fabric; @@ -1382,6 +1393,7 @@ static void 
RdmaProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Stre int writerCohortSize, CP_PeerCohort PeerCohort, void **providedWriterInfo_v) { + printf("[RDMA CALL RdmaProvideWriterDataToReader]\n"); Rdma_RS_Stream RS_Stream = (Rdma_RS_Stream)RS_Stream_v; FabricState Fabric = RS_Stream->Fabric; RdmaWriterContactInfo *providedWriterInfo = (RdmaWriterContactInfo *)providedWriterInfo_v; @@ -1419,6 +1431,7 @@ static void RdmaProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Stre static void LogRequest(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, long Timestep, size_t Offset, size_t Length) { + printf("[RDMA CALL LogRequest]\n"); RdmaStepLogEntry *StepLog_p; RdmaStepLogEntry StepLog; RdmaBuffer LogEntry; @@ -1477,6 +1490,7 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo size_t Offset, size_t Length, void *Buffer, RdmaBufferHandle Info, RdmaCompletionHandle *ret_v) { + printf("[RDMA CALL PostRead]\n"); FabricState Fabric = RS_Stream->Fabric; fi_addr_t SrcAddress = RS_Stream->WriterAddr[Rank]; void *LocalDesc = NULL; @@ -1542,6 +1556,7 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo static RdmaBuffer GetRequest(Rdma_RS_Stream Stream, RdmaStepLogEntry StepLog, int Rank, size_t Offset, size_t Length) { + printf("[RDMA CALL GetRequest]\n"); RdmaRankReqLog RankLog = &StepLog->RankLog[Rank]; RdmaBuffer Req; @@ -1650,6 +1665,7 @@ static void *RdmaReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int R static void RdmaNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int FailedPeerRank) { + printf("[RDMA CALL RdmaNotifyConnFailure]\n"); /* DP_RS_Stream is the return from InitReader */ Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, @@ -1663,6 +1679,7 @@ static void RdmaNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int F */ static int DoPushWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHandle Handle) { + printf("[RDMA CALL DoPushWait]\n"); FabricState Fabric = Stream->Fabric; RdmaStepLogEntry StepLog = Stream->PreloadStepLog; RdmaRankReqLog RankLog; @@ -1748,6 +1765,7 @@ static int DoPushWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHan static int WaitForAnyPull(CP_Services Svcs, Rdma_RS_Stream Stream) { + printf("[RDMA CALL WaitForAnyPull]\n"); FabricState Fabric = Stream->Fabric; RdmaCompletionHandle Handle_t; struct fi_cq_data_entry CQEntry = {0}; @@ -1786,6 +1804,7 @@ static int WaitForAnyPull(CP_Services Svcs, Rdma_RS_Stream Stream) static int DoPullWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHandle Handle) { + printf("[RDMA CALL DoPullWait]\n"); while (Handle->Pending > 0) { if (WaitForAnyPull(Svcs, Stream) == 0) @@ -1800,6 +1819,7 @@ static int DoPullWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHan */ static int RdmaWaitForCompletion(CP_Services Svcs, void *Handle_v) { + printf("[RDMA CALL RdmaWaitForCompletion]\n"); RdmaCompletionHandle Handle = (RdmaCompletionHandle)Handle_v; Rdma_RS_Stream Stream = Handle->CPStream; @@ -1819,6 +1839,7 @@ static void RdmaProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct struct _SstData *LocalMetadata, long Timestep, void **TimestepInfoPtr) { + printf("[RDMA CALL RdmaProvideTimestep]\n"); Rdma_WS_Stream Stream = (Rdma_WS_Stream)Stream_v; TimestepList Entry = malloc(sizeof(struct _TimestepEntry)); RdmaBufferHandle Info = malloc(sizeof(struct _RdmaBufferHandle)); @@ -1863,6 +1884,7 @@ static void 
RdmaProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct static void RdmaReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Timestep) { + printf("[RDMA CALL RdmaReleaseTimestep]\n"); Rdma_WS_Stream Stream = (Rdma_WS_Stream)Stream_v; TimestepList *List = &Stream->Timesteps; TimestepList ReleaseTSL; @@ -1905,6 +1927,7 @@ static void RdmaReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Ti static void RdmaDestroyRankReqLog(Rdma_RS_Stream RS_Stream, RdmaRankReqLog RankReqLog) { + printf("[RDMA CALL RdmaDestroyRankReqLog]\n"); int i; for (i = 0; i < RS_Stream->WriterCohortSize; i++) @@ -1919,6 +1942,7 @@ static void RdmaDestroyRankReqLog(Rdma_RS_Stream RS_Stream, RdmaRankReqLog RankR static void RdmaDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) { + printf("[RDMA CALL RdmaDestroyReader]\n"); Rdma_RS_Stream RS_Stream = (Rdma_RS_Stream)RS_Stream_v; RdmaStepLogEntry StepLog = RS_Stream->StepLog; RdmaStepLogEntry tStepLog; @@ -1958,6 +1982,7 @@ static void RdmaDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) static void RdmaDestroyWriterPerReader(CP_Services Svcs, DP_WSR_Stream WSR_Stream_v) { + printf("[RDMA CALL RdmaDestroyWriterPerReader]\n"); Rdma_WSR_Stream WSR_Stream = {0}; memcpy(&WSR_Stream, &WSR_Stream_v, sizeof(Rdma_WSR_Stream)); Rdma_WS_Stream WS_Stream = WSR_Stream->WS_Stream; @@ -2021,6 +2046,7 @@ static FMStructDescRec RdmaBufferHandleStructs[] = { static void RdmaDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) { + printf("[RDMA CALL RdmaDestroyWriter]\n"); Rdma_WS_Stream WS_Stream = (Rdma_WS_Stream)WS_Stream_v; long Timestep; #ifdef SST_HAVE_CRAY_DRC @@ -2094,6 +2120,7 @@ static struct _CP_DP_Interface RdmaDPInterface = {0}; */ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params) { + printf("[RDMA CALL RdmaGetPriority]\n"); return 100; struct fi_info *hints, *info, *originfo; char const *ifname; @@ -2213,11 +2240,13 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams */ static void RdmaUnGetPriority(CP_Services Svcs, void *CP_Stream) { + printf("[RDMA CALL RdmaUnGetPriority]\n"); Svcs->verbose(CP_Stream, DPPerStepVerbose, "RDMA Dataplane unloading\n"); } static void PushData(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList Step, int BufferSlot) { + printf("[RDMA CALL PushData]\n"); Rdma_WS_Stream WS_Stream = Stream->WS_Stream; FabricState Fabric = WS_Stream->Fabric; RdmaRankReqLog RankReq = Stream->PreloadReq; @@ -2269,6 +2298,7 @@ static void PushData(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList Step static void RdmaReaderRegisterTimestep(CP_Services Svcs, DP_WSR_Stream WSRStream_v, long Timestep, SstPreloadModeType PreloadMode) { + printf("[RDMA CALL RdmaReaderRegisterTimestep]\n"); Rdma_WSR_Stream WSR_Stream = (Rdma_WSR_Stream)WSRStream_v; Rdma_WS_Stream WS_Stream = WSR_Stream->WS_Stream; TimestepList Step; @@ -2296,6 +2326,7 @@ static void RdmaReaderRegisterTimestep(CP_Services Svcs, DP_WSR_Stream WSRStream static void PostPreload(CP_Services Svcs, Rdma_RS_Stream Stream, long Timestep) { + printf("[RDMA CALL PostPreload]\n"); RdmaStepLogEntry StepLog; FabricState Fabric = Stream->Fabric; RdmaBuffer PreloadBuffer = &Stream->PreloadBuffer; @@ -2463,6 +2494,7 @@ static void PostPreload(CP_Services Svcs, Rdma_RS_Stream Stream, long Timestep) static void RdmaTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Timestep, SstPreloadModeType PreloadMode) { + printf("[RDMA CALL RdmaTimestepArrived]\n"); Rdma_RS_Stream Stream = 
(Rdma_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, "%s with Timestep = %li, PreloadMode = %d\n", @@ -2490,6 +2522,7 @@ static void RdmaTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Ti static void RdmaReaderReleaseTimestep(CP_Services Svcs, DP_RS_Stream Stream_v, long Timestep) { + printf("[RDMA CALL RdmaReaderReleaseTimestep]\n"); Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; pthread_mutex_lock(&ts_mutex); @@ -2507,6 +2540,7 @@ static void RdmaReaderReleaseTimestep(CP_Services Svcs, DP_RS_Stream Stream_v, l static void PullSelection(CP_Services Svcs, Rdma_WSR_Stream Stream) { + printf("[RDMA CALL PullSelection]\n"); Rdma_WS_Stream WS_Stream = Stream->WS_Stream; FabricState Fabric = WS_Stream->Fabric; RdmaBuffer ReaderRoll = (RdmaBuffer)Stream->ReaderRoll->Handle.Block; @@ -2602,6 +2636,7 @@ static void PullSelection(CP_Services Svcs, Rdma_WSR_Stream Stream) static void CompletePush(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList Step) { + printf("[RDMA CALL CompletePush]\n"); Rdma_WS_Stream WS_Stream = Stream->WS_Stream; FabricState Fabric = WS_Stream->Fabric; TimestepList CQStep; @@ -2660,6 +2695,7 @@ static void CompletePush(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList static void RdmaReleaseTimestepPerReader(CP_Services Svcs, DP_WSR_Stream Stream_v, long Timestep) { + printf("[RDMA CALL RdmaReleaseTimestepPerReader]\n"); Rdma_WSR_Stream Stream = (Rdma_WSR_Stream)Stream_v; Rdma_WS_Stream WS_Stream = Stream->WS_Stream; TimestepList Step = GetStep(WS_Stream, Timestep); From 80904b85f6b6949c5178f59186ed59cffdbcf7c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 11:45:43 +0100 Subject: [PATCH 05/50] Revert "Hardcode address length lol" This reverts commit 5766f5a55e2310f65a6048b4f2fa25c0852582e9. 
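The hardcoded 24-byte address length is reverted because libfabric can report the required size itself: when the buffer passed to fi_getname() is too small, the call fails with -FI_ETOOSMALL and writes the needed length back through its size argument. The next patch in this series uses exactly that to grow the buffer and retry. A minimal standalone sketch of the pattern, assuming an already-created endpoint `ep` (the helper name is illustrative, not part of the patch):

#include <stdlib.h>
#include <rdma/fabric.h>
#include <rdma/fi_cm.h>

/* Fetch the local endpoint address, growing the buffer if the initial
 * guess (typically fi_info::src_addrlen) turns out to be too small. */
static void *fetch_ep_address(fid_t ep, size_t initial_len, size_t *out_len)
{
    size_t len = initial_len;
    void *addr = malloc(len);
    int rc = fi_getname(ep, addr, &len);
    if (rc == -FI_ETOOSMALL)
    {
        /* fi_getname() stored the required size in len; retry once. */
        addr = realloc(addr, len);
        rc = fi_getname(ep, addr, &len);
    }
    if (rc != FI_SUCCESS)
    {
        free(addr);
        return NULL;
    }
    *out_len = len;
    return addr;
}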
--- source/adios2/toolkit/sst/dp/rdma_dp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index f288b76ff1..4a971a7e5a 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1086,7 +1086,7 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea return NULL; } - ContactInfo->Length = 24; + ContactInfo->Length = Fabric->info->src_addrlen; ContactInfo->Address = malloc(ContactInfo->Length); if (guard_fi_return( fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length), Svcs, @@ -1353,7 +1353,7 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S ContactInfo = calloc(1, sizeof(struct _RdmaWriterContactInfo)); ContactInfo->WS_Stream = WSR_Stream; - ContactInfo->Length = 24; + ContactInfo->Length = Fabric->info->src_addrlen; ContactInfo->Address = malloc(ContactInfo->Length); if (guard_fi_return( fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length), Svcs, From 1cfb02b3607dbaf73909b3cb6093ddee5417528d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 11:53:12 +0100 Subject: [PATCH 06/50] Guard against too small address buffers --- source/adios2/toolkit/sst/dp/rdma_dp.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 4a971a7e5a..698c7eb4af 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1088,9 +1088,15 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea ContactInfo->Length = Fabric->info->src_addrlen; ContactInfo->Address = malloc(ContactInfo->Length); - if (guard_fi_return( - fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length), Svcs, - CP_Stream, "[RdmaInitReader] fi_getname() failed with:") != FI_SUCCESS) + int error_code = fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length); + if (error_code == -FI_ETOOSMALL) + { + // Try again, fabric info might have under-reported the address length + ContactInfo->Address = realloc(ContactInfo->Address, ContactInfo->Length); + error_code = fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length); + } + if (guard_fi_return(error_code, Svcs, CP_Stream, + "[RdmaInitWriterPerReader] fi_getname() failed with") != FI_SUCCESS) { return NULL; } @@ -1355,10 +1361,15 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S ContactInfo->Length = Fabric->info->src_addrlen; ContactInfo->Address = malloc(ContactInfo->Length); - if (guard_fi_return( - fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length), Svcs, - WS_Stream->CP_Stream, - "[RdmaInitWriterPerReader] fi_getname() failed with") != FI_SUCCESS) + int error_code = fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length); + if (error_code == -FI_ETOOSMALL) + { + // Try again, fabric info might have under-reported the address length + ContactInfo->Address = realloc(ContactInfo->Address, ContactInfo->Length); + error_code = fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length); + } + if (guard_fi_return(error_code, Svcs, WS_Stream->CP_Stream, + "[RdmaInitWriterPerReader] fi_getname() failed with") != FI_SUCCESS) { return NULL; } From 
e6f817962e3137ad64d5eaca9422f3323825a69c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 17:35:27 +0100 Subject: [PATCH 07/50] some stuff --- source/adios2/toolkit/sst/dp/rdma_dp.c | 29 +++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 698c7eb4af..2f86cacfc0 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -303,11 +303,11 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, char *prov_name = info->fabric_attr->prov_name; char *domain_name = info->domain_attr->name; - if (info->tx_attr->inject_size > 0) - { - info = info->next; - continue; - } + // if (info->tx_attr->inject_size > 0) + // { + // info = info->next; + // continue; + // } if (ifname && strcmp(ifname, domain_name) == 0) { @@ -1100,6 +1100,10 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea { return NULL; } + if (Stream->Fabric->info->addr_format == FI_ADDR_STR) + { + printf("Reader address: %s\n", (char const *)ContactInfo->Address); + } Stream->PreloadStep = -1; Stream->ContactInfo = ContactInfo; @@ -1373,15 +1377,19 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S { return NULL; } + if (Fabric->info->addr_format == FI_ADDR_STR) + { + printf("Writer address: %s\n", (char const *)ContactInfo->Address); + } ReaderRollHandle = &ContactInfo->ReaderRollHandle; ReaderRollHandle->Block = calloc(readerCohortSize, sizeof(struct _RdmaBuffer)); - static uint64_t key = 12345; sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, - readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_WRITE, 0, ++key, 0, + readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_WRITE, 0, 0, 0, &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, Fabric->info->domain_attr->mr_mode); ReaderRollHandle->Key = fi_mr_key(WSR_Stream->rrmr); + printf("Key: %lu\n", ReaderRollHandle->Key); WSR_Stream->WriterContactInfo = ContactInfo; @@ -1543,8 +1551,15 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo do { + printf("Going into fi_read()\n"); rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); + // if(rc == -EAGAIN) + // { + // struct fi_cq_data_entry CQEntry = {0}; + // ssize_t sq_rc; + // sq_rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, NULL, -1); + // } } while (rc == -EAGAIN); if (rc != 0) From 674dfe407c23af4ef31b7c36e4879297e5ee4062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 18:54:36 +0100 Subject: [PATCH 08/50] Use manual progress via thread --- source/adios2/toolkit/sst/dp/rdma_dp.c | 213 +++++++++++++++++-------- 1 file changed, 144 insertions(+), 69 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 2f86cacfc0..749ae8cdcd 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -8,11 +8,15 @@ #include #include "adios2/common/ADIOSConfig.h" +#include "rdma/fi_eq.h" +#include "rdma/fi_errno.h" #include #include #include +#include + #include #include #include @@ -121,6 +125,96 @@ int guard_fi_return(int code, CP_Services Svcs, CManager cm, char const *msg) return code; } +struct cq_event_list +{ + struct fi_cq_data_entry *value; + struct cq_event_list *next; +}; + +struct 
cq_manual_progress +{ + struct fid_cq *cq_signal; + pthread_mutex_t cq_event_list_mutex; + struct cq_event_list *cq_event_list; + + CP_Services Svcs; + void *Stream; + int do_continue; +}; + +void cq_manual_progress_push(struct cq_manual_progress *self, struct cq_event_list *item) +{ + pthread_mutex_lock(&self->cq_event_list_mutex); + if (!self->cq_event_list) + { + self->cq_event_list = item; + } + else + { + struct cq_event_list *head = self->cq_event_list; + while (head->next) + { + head = head->next; + } + head->next = item; + } + pthread_mutex_unlock(&self->cq_event_list_mutex); +} + +struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) +{ + struct fi_cq_data_entry *res; + pthread_mutex_lock(&self->cq_event_list_mutex); + if (!self->cq_event_list) + { + res = NULL; + } + else + { + struct cq_event_list *head = self->cq_event_list; + res = head->value; + self->cq_event_list = head->next; + free(head); + } + pthread_mutex_unlock(&self->cq_event_list_mutex); + return res; +} + +static void *make_progress(void *params_) +{ + struct cq_manual_progress *params = (struct cq_manual_progress *)params_; + struct fi_cq_data_entry *CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + while (params->do_continue) + { + printf("~~~~~~~~a little bit of progress?\n"); + ssize_t rc = fi_cq_read(params->cq_signal, (void *)(&CQEntry), 1); + if (rc < 1) + { + struct fi_cq_err_entry error; + fi_cq_readerr(params->cq_signal, &error, 0); + if (error.err != -FI_SUCCESS) + { + params->Svcs->verbose( + params->Stream, DPCriticalVerbose, + "[PullSelection] no completion event (%d (%s - %s)).\n", rc, + fi_strerror(error.err), + fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); + } + sleep(5); + } + else + { + printf("GOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOT COMPLETION\n"); + struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); + next_item->value = CQEntry; + next_item->next = NULL; + cq_manual_progress_push(params, next_item); + CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + } + } + return NULL; +} + struct fabric_state { struct fi_context *ctx; @@ -146,8 +240,26 @@ struct fabric_state uint32_t credential; struct fi_gni_auth_key *auth_key; #endif /* SST_HAVE_CRAY_DRC */ + struct cq_manual_progress cq_manual_progress; + pthread_t pthread_id; }; +void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry) +{ + while (1) + { + struct fi_cq_data_entry *res = cq_manual_progress_pop(&fabric->cq_manual_progress); + if (res == NULL) + { + sleep(5); + continue; + } + memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); + free(res); + return; + } +} + /* * Some conventions: * `RS` indicates a reader-side item. 
@@ -528,12 +640,38 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, } fi_freeinfo(originfo); + + fabric->cq_manual_progress.cq_signal = fabric->cq_signal; + if (pthread_mutex_init(&fabric->cq_manual_progress.cq_event_list_mutex, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); + return; + } + fabric->cq_manual_progress.cq_event_list = NULL; + fabric->cq_manual_progress.Svcs = Svcs; + fabric->cq_manual_progress.Stream = CP_Stream; + fabric->cq_manual_progress.do_continue = 1; + + if (pthread_create(&fabric->pthread_id, NULL, &make_progress, &fabric->cq_manual_progress) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return; + } } static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_Stream) { printf("[RDMA CALL fini_fabric]\n"); + fabric->cq_manual_progress.do_continue = 0; + // free other stuff + + if (pthread_join(fabric->pthread_id, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not join thread.\n"); + return; + } + int res; do @@ -1716,21 +1854,8 @@ static int DoPushWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHan while (Handle->Pending > 0) { - ssize_t rc; - rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, NULL, -1); - if (rc < 1) - { - struct fi_cq_err_entry error; - fi_cq_readerr(Fabric->cq_signal, &error, 0); - Svcs->verbose( - Stream->CP_Stream, DPCriticalVerbose, - "failure while waiting for completions inside " - "DoPushWait() (%d (%s - %s)).\n", - rc, fi_strerror(error.err), - fi_cq_strerror(Fabric->cq_signal, error.err, error.err_data, NULL, error.len)); - return 0; - } - else if (CQEntry.flags & FI_REMOTE_CQ_DATA) + cq_read(Fabric, &CQEntry); + if (CQEntry.flags & FI_REMOTE_CQ_DATA) { BufferSlot = CQEntry.data >> 31; WRidx = (CQEntry.data >> 20) & 0x3FF; @@ -1796,21 +1921,7 @@ static int WaitForAnyPull(CP_Services Svcs, Rdma_RS_Stream Stream) RdmaCompletionHandle Handle_t; struct fi_cq_data_entry CQEntry = {0}; - ssize_t rc; - rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, NULL, -1); - if (rc < 1) - { - struct fi_cq_err_entry error; - fi_cq_readerr(Fabric->cq_signal, &error, 0); - Svcs->verbose( - Stream->CP_Stream, DPCriticalVerbose, - "failure while waiting for completions inside " - "WaitForAnyPull() (%d (%s - %s)).\n", - rc, fi_strerror(error.err), - fi_cq_strerror(Fabric->cq_signal, error.err, error.err_data, NULL, error.len)); - return 0; - } - else + cq_read(Fabric, &CQEntry); { Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, "got completion for request with handle %p (flags %li).\n", @@ -2484,19 +2595,7 @@ static void PostPreload(CP_Services Svcs, Rdma_RS_Stream Stream, long Timestep) while (WRidx > 0) { - ssize_t rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, NULL, -1); - if (rc < 1) - { - struct fi_cq_err_entry error; - fi_cq_readerr(Fabric->cq_signal, &error, 0); - Svcs->verbose( - Stream->CP_Stream, DPCriticalVerbose, - "[PostPreload] failure while waiting for completions " - "(%d (%s - %s)).\n", - rc, fi_strerror(error.err), - fi_cq_strerror(Fabric->cq_signal, error.err, error.err_data, NULL, error.len)); - return; - } + cq_read(Fabric, &CQEntry); CQBuffer = CQEntry.op_context; if (CQBuffer >= SendBuffer && CQBuffer < (SendBuffer + StepLog->WRanks)) { @@ -2626,19 +2725,7 @@ static void PullSelection(CP_Services Svcs, Rdma_WSR_Stream Stream) RankReq = Stream->PreloadReq; while (RankReq) { - ssize_t rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, 
NULL, -1); - if (rc < 1) - { - struct fi_cq_err_entry error; - fi_cq_readerr(Fabric->cq_signal, &error, 0); - Svcs->verbose( - WS_Stream->CP_Stream, DPCriticalVerbose, - "[PullSelection] failure while waiting for completions " - "(%d (%s - %s)).\n", - rc, fi_strerror(error.err), - fi_cq_strerror(Fabric->cq_signal, error.err, error.err_data, NULL, error.len)); - return; - } + cq_read(Fabric, &CQEntry); CQRankReq = CQEntry.op_context; if (CQEntry.flags & FI_READ) { @@ -2671,19 +2758,7 @@ static void CompletePush(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList while (Step->OutstandingWrites > 0) { - ssize_t rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, NULL, -1); - if (rc < 1) - { - struct fi_cq_err_entry error; - fi_cq_readerr(Fabric->cq_signal, &error, 0); - Svcs->verbose( - WS_Stream->CP_Stream, DPCriticalVerbose, - "[CompletePush] failure while waiting for completions " - "(%d (%s - %s)).\n", - rc, fi_strerror(error.err), - fi_cq_strerror(Fabric->cq_signal, error.err, error.err_data, NULL, error.len)); - return; - } + cq_read(Fabric, &CQEntry); if (CQEntry.flags & FI_WRITE) { CQTimestep = (long)CQEntry.op_context; From 15be730dc16374db93300f841e18d73d278d50e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 8 Dec 2023 19:39:05 +0100 Subject: [PATCH 09/50] Fix wrong memory access mode? --- source/adios2/toolkit/sst/dp/rdma_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 749ae8cdcd..04b3c173e1 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1523,7 +1523,7 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S ReaderRollHandle = &ContactInfo->ReaderRollHandle; ReaderRollHandle->Block = calloc(readerCohortSize, sizeof(struct _RdmaBuffer)); sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, - readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_WRITE, 0, 0, 0, + readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_READ, 0, 0, 0, &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, Fabric->info->domain_attr->mr_mode); ReaderRollHandle->Key = fi_mr_key(WSR_Stream->rrmr); From 744335677adc876f3ca875e4dc89bed6b9597971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:08:12 +0100 Subject: [PATCH 10/50] Tentatively working --- source/adios2/toolkit/sst/dp/rdma_dp.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 04b3c173e1..bfc8b52dc7 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -78,6 +78,7 @@ int sst_fi_mr_reg( /* additional parameters for binding the mr to the endpoint*/ struct fid_ep *endpoint, int mr_mode) { + printf("Registering from %p to %p\n", buf, buf + len); int res = fi_mr_reg(domain, buf, len, acs, offset, requested_key, flags, mr, context); int is_mr_endpoint = (mr_mode & FI_MR_ENDPOINT) != 0; if (!is_mr_endpoint) @@ -187,7 +188,7 @@ static void *make_progress(void *params_) while (params->do_continue) { printf("~~~~~~~~a little bit of progress?\n"); - ssize_t rc = fi_cq_read(params->cq_signal, (void *)(&CQEntry), 1); + ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); if (rc < 1) { struct fi_cq_err_entry error; @@ -1522,11 +1523,11 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services 
Svcs, DP_WS_Stream WS_S ReaderRollHandle = &ContactInfo->ReaderRollHandle; ReaderRollHandle->Block = calloc(readerCohortSize, sizeof(struct _RdmaBuffer)); - sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, - readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_READ, 0, 0, 0, - &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, - Fabric->info->domain_attr->mr_mode); - ReaderRollHandle->Key = fi_mr_key(WSR_Stream->rrmr); + // sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, + // readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_READ, 0, 0, 0, + // &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, + // Fabric->info->domain_attr->mr_mode); + ReaderRollHandle->Key = 0; //fi_mr_key(WSR_Stream->rrmr); printf("Key: %lu\n", ReaderRollHandle->Key); WSR_Stream->WriterContactInfo = ContactInfo; @@ -1689,7 +1690,8 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo do { - printf("Going into fi_read()\n"); + printf("Going into fi_read() from %p (= %p + %lu) to %p\n", Addr, Info->Block, Offset, + Addr + Length); rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); // if(rc == -EAGAIN) From 3f37d28757faf0f0189e8cae49a0019fcb7c9d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:21:34 +0100 Subject: [PATCH 11/50] Fixes --- source/adios2/toolkit/sst/dp/rdma_dp.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index bfc8b52dc7..05f07dd4fb 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -191,7 +191,7 @@ static void *make_progress(void *params_) ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); if (rc < 1) { - struct fi_cq_err_entry error; + struct fi_cq_err_entry error = {.err = 0}; fi_cq_readerr(params->cq_signal, &error, 0); if (error.err != -FI_SUCCESS) { @@ -530,9 +530,9 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, */ if (info->domain_attr->mr_mode != FI_MR_BASIC) { - // info->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | - // (FI_MR_ENDPOINT & info->domain_attr->mr_mode) | - // (FI_MR_VIRT_ADDR & info->domain_attr->mr_mode); + info->domain_attr->mr_mode = FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL | + (FI_MR_ENDPOINT & info->domain_attr->mr_mode) | + (FI_MR_VIRT_ADDR & info->domain_attr->mr_mode); fabric->mr_virt_addr = info->domain_attr->mr_mode & FI_MR_VIRT_ADDR ? 
1 : 0; } else @@ -1523,11 +1523,11 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S ReaderRollHandle = &ContactInfo->ReaderRollHandle; ReaderRollHandle->Block = calloc(readerCohortSize, sizeof(struct _RdmaBuffer)); - // sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, - // readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_READ, 0, 0, 0, - // &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, - // Fabric->info->domain_attr->mr_mode); - ReaderRollHandle->Key = 0; //fi_mr_key(WSR_Stream->rrmr); + sst_fi_mr_reg(Svcs, WS_Stream->CP_Stream, Fabric->domain, ReaderRollHandle->Block, + readerCohortSize * sizeof(struct _RdmaBuffer), FI_REMOTE_WRITE, 0, 0, 0, + &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, + Fabric->info->domain_attr->mr_mode); + ReaderRollHandle->Key = fi_mr_key(WSR_Stream->rrmr); printf("Key: %lu\n", ReaderRollHandle->Key); WSR_Stream->WriterContactInfo = ContactInfo; From 37d73372d6c1abac13adc7d0a992d2c14cb744d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:25:56 +0100 Subject: [PATCH 12/50] Bit better backoff for background thread --- source/adios2/toolkit/sst/dp/rdma_dp.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 05f07dd4fb..800910b3ed 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -181,10 +181,15 @@ struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) return res; } +#define SST_BACKOFF_SECONDS_MAX 5 + static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; struct fi_cq_data_entry *CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + + unsigned int current_backoff_seconds = 0; + while (params->do_continue) { printf("~~~~~~~~a little bit of progress?\n"); @@ -201,7 +206,11 @@ static void *make_progress(void *params_) fi_strerror(error.err), fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); } - sleep(5); + sleep(current_backoff_seconds); + if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) + { + ++current_backoff_seconds; + } } else { @@ -211,6 +220,7 @@ static void *make_progress(void *params_) next_item->next = NULL; cq_manual_progress_push(params, next_item); CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + current_backoff_seconds = 0; } } return NULL; @@ -247,12 +257,17 @@ struct fabric_state void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry) { + unsigned int current_backoff_seconds = 0; while (1) { struct fi_cq_data_entry *res = cq_manual_progress_pop(&fabric->cq_manual_progress); if (res == NULL) { - sleep(5); + sleep(current_backoff_seconds); + if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) + { + ++current_backoff_seconds; + } continue; } memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); From de90225c026ac510f20f3b0f7427325e509bfb6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:31:50 +0100 Subject: [PATCH 13/50] Remove some debugging output --- source/adios2/toolkit/sst/dp/rdma_dp.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 800910b3ed..26d0758792 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -78,7 +78,6 @@ int 
sst_fi_mr_reg( /* additional parameters for binding the mr to the endpoint*/ struct fid_ep *endpoint, int mr_mode) { - printf("Registering from %p to %p\n", buf, buf + len); int res = fi_mr_reg(domain, buf, len, acs, offset, requested_key, flags, mr, context); int is_mr_endpoint = (mr_mode & FI_MR_ENDPOINT) != 0; if (!is_mr_endpoint) @@ -192,7 +191,6 @@ static void *make_progress(void *params_) while (params->do_continue) { - printf("~~~~~~~~a little bit of progress?\n"); ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); if (rc < 1) { @@ -214,7 +212,6 @@ static void *make_progress(void *params_) } else { - printf("GOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOT COMPLETION\n"); struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); next_item->value = CQEntry; next_item->next = NULL; @@ -1705,8 +1702,6 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo do { - printf("Going into fi_read() from %p (= %p + %lu) to %p\n", Addr, Info->Block, Offset, - Addr + Length); rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); // if(rc == -EAGAIN) From aa6cdee68cbd7ce90988bd827ca8cea055b76d7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:32:29 +0100 Subject: [PATCH 14/50] Revert "Tracing output" This reverts commit 4a312d3fd8319a5be2c31beedd31266fbc0c57a5. --- source/adios2/toolkit/sst/dp/rdma_dp.c | 36 -------------------------- 1 file changed, 36 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 26d0758792..0124d039af 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -117,7 +117,6 @@ int sst_fi_mr_reg( */ int guard_fi_return(int code, CP_Services Svcs, CManager cm, char const *msg) { - printf("[RDMA CALL guard_fi_return]\n"); if (code != FI_SUCCESS) { Svcs->verbose(cm, DPCriticalVerbose, "%s: %s (%lu)\n", msg, fi_strerror(code), code); @@ -324,7 +323,6 @@ static char const *get_preferred_domain(struct _SstParams *Params) static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, CP_Services Svcs, void *CP_Stream, char const *ifname) { - printf("[RDMA CALL init_fabric]\n"); struct fi_info *hints, *info, *originfo, *useinfo; struct fi_av_attr av_attr = {FI_AV_UNSPEC}; struct fi_cq_attr cq_attr = {0}; @@ -674,7 +672,6 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_Stream) { - printf("[RDMA CALL fini_fabric]\n"); fabric->cq_manual_progress.do_continue = 0; // free other stuff @@ -916,7 +913,6 @@ typedef struct _RdmaWriterContactInfo static TimestepList GetStep(Rdma_WS_Stream Stream, long Timestep) { - printf("[RDMA CALL GetStep]\n"); TimestepList Step; pthread_mutex_lock(&ts_mutex); @@ -934,7 +930,6 @@ static TimestepList GetStep(Rdma_WS_Stream Stream, long Timestep) static int get_cxi_auth_key_from_env(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, struct cxi_auth_key *key, char **used_device) { - printf("[RDMA CALL get_cxi_auth_key_from_env]\n"); int vni, first_vni, second_vni, svc_id; // Just some safety against faulty strings in string processing. 
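This revert strips the unconditional printf() call tracing added in PATCH 04. If per-call tracing is wanted again later, routing it through the data plane's existing verbose callback keeps it switchable with the usual SST verbosity levels; one possible helper (hypothetical, not part of the patch):

/* Trace entry into an RDMA DP function through the CP_Services verbose
 * callback instead of printf(), so it obeys DPTraceVerbose filtering. */
#define RDMA_TRACE_CALL(Svcs, CP_Stream)                                       \
    (Svcs)->verbose((CP_Stream), DPTraceVerbose, "[RDMA CALL %s]\n", __func__)

/* Usage inside a function that has Svcs and a CP_Stream in scope:
 *     RDMA_TRACE_CALL(Svcs, Stream->CP_Stream);
 */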
@@ -1106,7 +1101,6 @@ static int get_cxi_auth_key_from_env(CP_Services Svcs, void *CP_Stream, struct _ static int get_cxi_auth_key_from_writer(struct cxi_auth_key *key, attr_list WriterContact) { - printf("[RDMA CALL get_cxi_auth_key_from_writer]\n"); long vni; if (!get_long_attr(WriterContact, attr_atom_from_string("vni"), &vni)) { @@ -1121,7 +1115,6 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea struct _SstParams *Params, attr_list WriterContact, SstStats Stats) { - printf("[RDMA CALL RdmaInitReader]\n"); Rdma_RS_Stream Stream = malloc(sizeof(struct _Rdma_RS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); RdmaReaderContactInfo ContactInfo = malloc(sizeof(struct _RdmaReaderContactInfo)); @@ -1272,7 +1265,6 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea static void RdmaReadPatternLocked(CP_Services Svcs, DP_WSR_Stream WSRStream_v, long EffectiveTimestep) { - printf("[RDMA CALL RdmaReadPatternLocked]\n"); Rdma_WSR_Stream WSR_Stream = (Rdma_WSR_Stream)WSRStream_v; Rdma_WS_Stream WS_Stream = WSR_Stream->WS_Stream; @@ -1296,7 +1288,6 @@ static void RdmaReadPatternLocked(CP_Services Svcs, DP_WSR_Stream WSRStream_v, static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long EffectiveTimestep) { - printf("[RDMA CALL RdmaWritePatternLocked]\n"); Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; if (Stream->PreloadAvail) @@ -1319,7 +1310,6 @@ static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long static DP_WS_Stream RdmaInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { - printf("[RDMA CALL RdmaInitWriter]\n"); Rdma_WS_Stream Stream = malloc(sizeof(struct _Rdma_WS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); char *PreloadEnv; @@ -1468,7 +1458,6 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S void **providedReaderInfo_v, void **WriterContactInfoPtr) { - printf("[RDMA CALL RdmaInitWriterPerReader]\n"); Rdma_WS_Stream WS_Stream = (Rdma_WS_Stream)WS_Stream_v; Rdma_WSR_Stream WSR_Stream = malloc(sizeof(*WSR_Stream)); FabricState Fabric = WS_Stream->Fabric; @@ -1563,7 +1552,6 @@ static void RdmaProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Stre int writerCohortSize, CP_PeerCohort PeerCohort, void **providedWriterInfo_v) { - printf("[RDMA CALL RdmaProvideWriterDataToReader]\n"); Rdma_RS_Stream RS_Stream = (Rdma_RS_Stream)RS_Stream_v; FabricState Fabric = RS_Stream->Fabric; RdmaWriterContactInfo *providedWriterInfo = (RdmaWriterContactInfo *)providedWriterInfo_v; @@ -1601,7 +1589,6 @@ static void RdmaProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Stre static void LogRequest(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, long Timestep, size_t Offset, size_t Length) { - printf("[RDMA CALL LogRequest]\n"); RdmaStepLogEntry *StepLog_p; RdmaStepLogEntry StepLog; RdmaBuffer LogEntry; @@ -1660,7 +1647,6 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo size_t Offset, size_t Length, void *Buffer, RdmaBufferHandle Info, RdmaCompletionHandle *ret_v) { - printf("[RDMA CALL PostRead]\n"); FabricState Fabric = RS_Stream->Fabric; fi_addr_t SrcAddress = RS_Stream->WriterAddr[Rank]; void *LocalDesc = NULL; @@ -1732,7 +1718,6 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo static RdmaBuffer GetRequest(Rdma_RS_Stream Stream, RdmaStepLogEntry StepLog, int Rank, size_t Offset, size_t Length) 
{ - printf("[RDMA CALL GetRequest]\n"); RdmaRankReqLog RankLog = &StepLog->RankLog[Rank]; RdmaBuffer Req; @@ -1841,7 +1826,6 @@ static void *RdmaReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int R static void RdmaNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int FailedPeerRank) { - printf("[RDMA CALL RdmaNotifyConnFailure]\n"); /* DP_RS_Stream is the return from InitReader */ Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, @@ -1855,7 +1839,6 @@ static void RdmaNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int F */ static int DoPushWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHandle Handle) { - printf("[RDMA CALL DoPushWait]\n"); FabricState Fabric = Stream->Fabric; RdmaStepLogEntry StepLog = Stream->PreloadStepLog; RdmaRankReqLog RankLog; @@ -1928,7 +1911,6 @@ static int DoPushWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHan static int WaitForAnyPull(CP_Services Svcs, Rdma_RS_Stream Stream) { - printf("[RDMA CALL WaitForAnyPull]\n"); FabricState Fabric = Stream->Fabric; RdmaCompletionHandle Handle_t; struct fi_cq_data_entry CQEntry = {0}; @@ -1953,7 +1935,6 @@ static int WaitForAnyPull(CP_Services Svcs, Rdma_RS_Stream Stream) static int DoPullWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHandle Handle) { - printf("[RDMA CALL DoPullWait]\n"); while (Handle->Pending > 0) { if (WaitForAnyPull(Svcs, Stream) == 0) @@ -1968,7 +1949,6 @@ static int DoPullWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHan */ static int RdmaWaitForCompletion(CP_Services Svcs, void *Handle_v) { - printf("[RDMA CALL RdmaWaitForCompletion]\n"); RdmaCompletionHandle Handle = (RdmaCompletionHandle)Handle_v; Rdma_RS_Stream Stream = Handle->CPStream; @@ -1988,7 +1968,6 @@ static void RdmaProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct struct _SstData *LocalMetadata, long Timestep, void **TimestepInfoPtr) { - printf("[RDMA CALL RdmaProvideTimestep]\n"); Rdma_WS_Stream Stream = (Rdma_WS_Stream)Stream_v; TimestepList Entry = malloc(sizeof(struct _TimestepEntry)); RdmaBufferHandle Info = malloc(sizeof(struct _RdmaBufferHandle)); @@ -2033,7 +2012,6 @@ static void RdmaProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct static void RdmaReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Timestep) { - printf("[RDMA CALL RdmaReleaseTimestep]\n"); Rdma_WS_Stream Stream = (Rdma_WS_Stream)Stream_v; TimestepList *List = &Stream->Timesteps; TimestepList ReleaseTSL; @@ -2076,7 +2054,6 @@ static void RdmaReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Ti static void RdmaDestroyRankReqLog(Rdma_RS_Stream RS_Stream, RdmaRankReqLog RankReqLog) { - printf("[RDMA CALL RdmaDestroyRankReqLog]\n"); int i; for (i = 0; i < RS_Stream->WriterCohortSize; i++) @@ -2091,7 +2068,6 @@ static void RdmaDestroyRankReqLog(Rdma_RS_Stream RS_Stream, RdmaRankReqLog RankR static void RdmaDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) { - printf("[RDMA CALL RdmaDestroyReader]\n"); Rdma_RS_Stream RS_Stream = (Rdma_RS_Stream)RS_Stream_v; RdmaStepLogEntry StepLog = RS_Stream->StepLog; RdmaStepLogEntry tStepLog; @@ -2131,7 +2107,6 @@ static void RdmaDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) static void RdmaDestroyWriterPerReader(CP_Services Svcs, DP_WSR_Stream WSR_Stream_v) { - printf("[RDMA CALL RdmaDestroyWriterPerReader]\n"); Rdma_WSR_Stream WSR_Stream = {0}; memcpy(&WSR_Stream, &WSR_Stream_v, sizeof(Rdma_WSR_Stream)); Rdma_WS_Stream WS_Stream 
= WSR_Stream->WS_Stream; @@ -2195,7 +2170,6 @@ static FMStructDescRec RdmaBufferHandleStructs[] = { static void RdmaDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) { - printf("[RDMA CALL RdmaDestroyWriter]\n"); Rdma_WS_Stream WS_Stream = (Rdma_WS_Stream)WS_Stream_v; long Timestep; #ifdef SST_HAVE_CRAY_DRC @@ -2269,7 +2243,6 @@ static struct _CP_DP_Interface RdmaDPInterface = {0}; */ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params) { - printf("[RDMA CALL RdmaGetPriority]\n"); return 100; struct fi_info *hints, *info, *originfo; char const *ifname; @@ -2389,13 +2362,11 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams */ static void RdmaUnGetPriority(CP_Services Svcs, void *CP_Stream) { - printf("[RDMA CALL RdmaUnGetPriority]\n"); Svcs->verbose(CP_Stream, DPPerStepVerbose, "RDMA Dataplane unloading\n"); } static void PushData(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList Step, int BufferSlot) { - printf("[RDMA CALL PushData]\n"); Rdma_WS_Stream WS_Stream = Stream->WS_Stream; FabricState Fabric = WS_Stream->Fabric; RdmaRankReqLog RankReq = Stream->PreloadReq; @@ -2447,7 +2418,6 @@ static void PushData(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList Step static void RdmaReaderRegisterTimestep(CP_Services Svcs, DP_WSR_Stream WSRStream_v, long Timestep, SstPreloadModeType PreloadMode) { - printf("[RDMA CALL RdmaReaderRegisterTimestep]\n"); Rdma_WSR_Stream WSR_Stream = (Rdma_WSR_Stream)WSRStream_v; Rdma_WS_Stream WS_Stream = WSR_Stream->WS_Stream; TimestepList Step; @@ -2475,7 +2445,6 @@ static void RdmaReaderRegisterTimestep(CP_Services Svcs, DP_WSR_Stream WSRStream static void PostPreload(CP_Services Svcs, Rdma_RS_Stream Stream, long Timestep) { - printf("[RDMA CALL PostPreload]\n"); RdmaStepLogEntry StepLog; FabricState Fabric = Stream->Fabric; RdmaBuffer PreloadBuffer = &Stream->PreloadBuffer; @@ -2631,7 +2600,6 @@ static void PostPreload(CP_Services Svcs, Rdma_RS_Stream Stream, long Timestep) static void RdmaTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Timestep, SstPreloadModeType PreloadMode) { - printf("[RDMA CALL RdmaTimestepArrived]\n"); Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, "%s with Timestep = %li, PreloadMode = %d\n", @@ -2659,7 +2627,6 @@ static void RdmaTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Ti static void RdmaReaderReleaseTimestep(CP_Services Svcs, DP_RS_Stream Stream_v, long Timestep) { - printf("[RDMA CALL RdmaReaderReleaseTimestep]\n"); Rdma_RS_Stream Stream = (Rdma_RS_Stream)Stream_v; pthread_mutex_lock(&ts_mutex); @@ -2677,7 +2644,6 @@ static void RdmaReaderReleaseTimestep(CP_Services Svcs, DP_RS_Stream Stream_v, l static void PullSelection(CP_Services Svcs, Rdma_WSR_Stream Stream) { - printf("[RDMA CALL PullSelection]\n"); Rdma_WS_Stream WS_Stream = Stream->WS_Stream; FabricState Fabric = WS_Stream->Fabric; RdmaBuffer ReaderRoll = (RdmaBuffer)Stream->ReaderRoll->Handle.Block; @@ -2761,7 +2727,6 @@ static void PullSelection(CP_Services Svcs, Rdma_WSR_Stream Stream) static void CompletePush(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList Step) { - printf("[RDMA CALL CompletePush]\n"); Rdma_WS_Stream WS_Stream = Stream->WS_Stream; FabricState Fabric = WS_Stream->Fabric; TimestepList CQStep; @@ -2808,7 +2773,6 @@ static void CompletePush(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList static void RdmaReleaseTimestepPerReader(CP_Services Svcs, DP_WSR_Stream Stream_v, long 
Timestep) { - printf("[RDMA CALL RdmaReleaseTimestepPerReader]\n"); Rdma_WSR_Stream Stream = (Rdma_WSR_Stream)Stream_v; Rdma_WS_Stream WS_Stream = Stream->WS_Stream; TimestepList Step = GetStep(WS_Stream, Timestep); From 75be33d991b09cedd295b4b1fd412a0e071858b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:40:43 +0100 Subject: [PATCH 15/50] Cleanup --- source/adios2/toolkit/sst/dp/rdma_dp.c | 34 ++++++++------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 0124d039af..7a816cbcba 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -334,13 +334,6 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->mode = FI_CONTEXT | FI_LOCAL_MR | FI_CONTEXT2 | FI_MSG_PREFIX | FI_ASYNC_IOV | FI_RX_CQ_DATA; hints->ep_attr->type = FI_EP_RDM; - { - char const *prov_name = "shm"; - size_t len = strlen(prov_name) + 1; - char *construct_prov_name = malloc(len); - memcpy(construct_prov_name, prov_name, len); - hints->fabric_attr->prov_name = construct_prov_name; - } uint32_t fi_version; #ifdef SST_HAVE_CRAY_CXI @@ -378,7 +371,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->domain_attr->data_progress = FI_PROGRESS_AUTO; } #else - fi_version = FI_VERSION(1, 18); + fi_version = FI_VERSION(1, 5); // Alternatively, one could set mr_mode to // FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL @@ -388,7 +381,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, // The RDMA DP is able to deal with this appropriately, and does so right // before calling fi_fabric() further below in this function. // The main reason for keeping FI_MR_BASIC here is backward compatibility. 
- hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY; + hints->domain_attr->mr_mode = FI_MR_BASIC; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; #endif @@ -426,12 +419,6 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, char *prov_name = info->fabric_attr->prov_name; char *domain_name = info->domain_attr->name; - // if (info->tx_attr->inject_size > 0) - // { - // info = info->next; - // continue; - // } - if (ifname && strcmp(ifname, domain_name) == 0) { Svcs->verbose(CP_Stream, DPTraceVerbose, "using interface set by FABRIC_IFACE.\n"); @@ -597,7 +584,10 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, } av_attr.type = FI_AV_MAP; - // av_attr.count = DP_AV_DEF_SIZE; + if (strncmp(fabric->info->fabric_attr->prov_name, "shm", 4) != 0) + { + av_attr.count = DP_AV_DEF_SIZE; + } av_attr.ep_per_node = 0; result = fi_av_open(fabric->domain, &av_attr, &fabric->av, fabric->ctx); if (result != FI_SUCCESS) @@ -1246,7 +1236,8 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea } if (Stream->Fabric->info->addr_format == FI_ADDR_STR) { - printf("Reader address: %s\n", (char const *)ContactInfo->Address); + Svcs->verbose(Stream, DPSummaryVerbose, "Reader address: %s\n", + (char const *)ContactInfo->Address); } Stream->PreloadStep = -1; @@ -1519,7 +1510,8 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S } if (Fabric->info->addr_format == FI_ADDR_STR) { - printf("Writer address: %s\n", (char const *)ContactInfo->Address); + Svcs->verbose(WS_Stream->CP_Stream, DPSummaryVerbose, "Writer address: %s\n", + (char const *)ContactInfo->Address); } ReaderRollHandle = &ContactInfo->ReaderRollHandle; @@ -1690,12 +1682,6 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo { rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); - // if(rc == -EAGAIN) - // { - // struct fi_cq_data_entry CQEntry = {0}; - // ssize_t sq_rc; - // sq_rc = fi_cq_sread(Fabric->cq_signal, (void *)(&CQEntry), 1, NULL, -1); - // } } while (rc == -EAGAIN); if (rc != 0) From df263069cdc111551c7e0e1bc2364b43451dce4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:44:01 +0100 Subject: [PATCH 16/50] Do not specify data_progress --- source/adios2/toolkit/sst/dp/rdma_dp.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 7a816cbcba..381bbe8f52 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -343,7 +343,8 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->domain_attr->mr_mode = FI_MR_ENDPOINT; hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; - hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + // data progress unspecified, both are fine + // hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; // Authentication is needed // TODO: the first ID in SLINGSHOT_SVC_IDS is chosen, but we should @@ -368,7 +369,8 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->domain_attr->mr_mode = FI_MR_BASIC; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + // data progress unspecified, both are fine 
+ // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; } #else fi_version = FI_VERSION(1, 5); @@ -383,6 +385,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, // The main reason for keeping FI_MR_BASIC here is backward compatibility. hints->domain_attr->mr_mode = FI_MR_BASIC; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; + // data progress unspecified, both are fine // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; #endif @@ -2229,7 +2232,6 @@ static struct _CP_DP_Interface RdmaDPInterface = {0}; */ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params) { - return 100; struct fi_info *hints, *info, *originfo; char const *ifname; char *forkunsafe; @@ -2257,7 +2259,8 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams hints->domain_attr->mr_mode = FI_MR_ENDPOINT; hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; - hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + // data progress unspecified, both are fine + // hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; } else { @@ -2270,7 +2273,8 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams hints->domain_attr->mr_mode = FI_MR_BASIC; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + // data progress unspecified, both are fine + // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; } ifname = get_preferred_domain(Params); From b284dc7c4c407df55ddad590aeb0347e5bb47172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 15:49:30 +0100 Subject: [PATCH 17/50] No longer use FI_MR_BASIC shm complains about it, so use the equivalent FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL --- source/adios2/toolkit/sst/dp/rdma_dp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 381bbe8f52..196cb70695 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -203,6 +203,7 @@ static void *make_progress(void *params_) fi_strerror(error.err), fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); } + printf("Backung off for %d seconds\n", current_backoff_seconds); sleep(current_backoff_seconds); if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) { @@ -367,7 +368,8 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, { fi_version = FI_VERSION(1, 5); - hints->domain_attr->mr_mode = FI_MR_BASIC; + hints->domain_attr->mr_mode = + FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; // data progress unspecified, both are fine // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; @@ -383,7 +385,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, // The RDMA DP is able to deal with this appropriately, and does so right // before calling fi_fabric() further below in this function. // The main reason for keeping FI_MR_BASIC here is backward compatibility. 
- hints->domain_attr->mr_mode = FI_MR_BASIC; + hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; // data progress unspecified, both are fine // hints->domain_attr->data_progress = FI_PROGRESS_AUTO; From 0995c7f993134ae942c14227b99f5cc87604a8d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 11 Dec 2023 18:46:03 +0100 Subject: [PATCH 18/50] Try enqueuing the fi_read() on the thread --- source/adios2/toolkit/sst/dp/rdma_dp.c | 107 ++++++++++++++++++++----- 1 file changed, 89 insertions(+), 18 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 196cb70695..7ae618341b 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -124,6 +124,22 @@ int guard_fi_return(int code, CP_Services Svcs, CManager cm, char const *msg) return code; } +struct pending_fi_reads +{ + int source_rank; + + struct fid_ep *ep; + void *buf; + size_t len; + void *desc; + fi_addr_t src_addr; + uint64_t addr; + uint64_t key; + void *context; + + struct pending_fi_reads *next; +}; + struct cq_event_list { struct fi_cq_data_entry *value; @@ -136,6 +152,9 @@ struct cq_manual_progress pthread_mutex_t cq_event_list_mutex; struct cq_event_list *cq_event_list; + struct pending_fi_reads *pending_fi_reads; + pthread_mutex_t pending_fi_reads_mutex; + CP_Services Svcs; void *Stream; int do_continue; @@ -181,6 +200,54 @@ struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) #define SST_BACKOFF_SECONDS_MAX 5 +static void process_pending_fi_reads(struct cq_manual_progress *self) +{ + pthread_mutex_lock(&self->pending_fi_reads_mutex); + + struct pending_fi_reads *head = self->pending_fi_reads; + self->pending_fi_reads = NULL; + struct pending_fi_reads **reinsert = &self->pending_fi_reads; + + while (head) + { + printf("Processing read of size %lu from %p to %p\n", head->len, (void *)head->addr, + head->buf); + ssize_t rc = fi_read(head->ep, head->buf, head->len, head->desc, head->src_addr, head->addr, + head->key, head->context); + switch (rc) + { + case -EAGAIN: { + struct pending_fi_reads *next = head->next; + head->next = NULL; + *reinsert = head; + reinsert = &head->next; + head = next; + break; + } + case 0: { + self->Svcs->verbose(self->Stream, DPTraceVerbose, + "Posted RDMA get for Writer Rank %d for handle %p\n", + head->source_rank, head->context); + struct pending_fi_reads *next = head->next; + free(head); + head = next; + break; + } + default: { + self->Svcs->verbose( + self->Stream, DPCriticalVerbose, + "fi_read failed with code %d. 
Will skip, but there might be hangups.\n", rc); + struct pending_fi_reads *next = head->next; + free(head); + head = next; + break; + } + } + } + + pthread_mutex_unlock(&self->pending_fi_reads_mutex); +} + static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; @@ -190,6 +257,8 @@ static void *make_progress(void *params_) while (params->do_continue) { + process_pending_fi_reads(params); + ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); if (rc < 1) { @@ -203,7 +272,7 @@ static void *make_progress(void *params_) fi_strerror(error.err), fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); } - printf("Backung off for %d seconds\n", current_backoff_seconds); + printf("Backing off for %d seconds\n", current_backoff_seconds); sleep(current_backoff_seconds); if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) { @@ -1649,7 +1718,6 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo void *LocalDesc = NULL; uint8_t *Addr; RdmaCompletionHandle ret; - ssize_t rc; *ret_v = malloc(sizeof(struct _RdmaCompletionHandle)); ret = *ret_v; @@ -1683,27 +1751,30 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo "Remote read target is Rank %d (Offset = %zi, Length = %zi)\n", Rank, Offset, Length); - do { - rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, - Info->Key, ret); - } while (rc == -EAGAIN); + struct pending_fi_reads *post_read = malloc(sizeof(struct pending_fi_reads)); + post_read->source_rank = Rank; + post_read->ep = Fabric->signal; + post_read->buf = Buffer; + post_read->len = Length; + post_read->desc = LocalDesc; + post_read->src_addr = SrcAddress; + post_read->addr = (uint64_t)Addr; + post_read->key = Info->Key; + post_read->context = ret; - if (rc != 0) - { - Svcs->verbose(RS_Stream->CP_Stream, DPCriticalVerbose, "fi_read failed with code %d.\n", - rc); - return (rc); - } - else - { + pthread_mutex_lock(&Fabric->cq_manual_progress.pending_fi_reads_mutex); - Svcs->verbose(RS_Stream->CP_Stream, DPTraceVerbose, - "Posted RDMA get for Writer Rank %d for handle %p\n", Rank, (void *)ret); - RS_Stream->PendingReads++; + post_read->next = Fabric->cq_manual_progress.pending_fi_reads; + Fabric->cq_manual_progress.pending_fi_reads = post_read; + + pthread_mutex_unlock(&Fabric->cq_manual_progress.pending_fi_reads_mutex); } + // @todo: replace with an atomic and update in the thread? 
+ // better safeguard against failures: if a read fails, we should not count it + RS_Stream->PendingReads++; - return (rc); + return (0); } static RdmaBuffer GetRequest(Rdma_RS_Stream Stream, RdmaStepLogEntry StepLog, int Rank, From 5b90dd259cf521695c06beea23a28db53489d8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 12 Dec 2023 14:40:11 +0100 Subject: [PATCH 19/50] Request n items at once --- source/adios2/toolkit/sst/dp/rdma_dp.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 7ae618341b..0962faf849 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -251,7 +251,8 @@ static void process_pending_fi_reads(struct cq_manual_progress *self) static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; - struct fi_cq_data_entry *CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + size_t const n_entries = 100; + struct fi_cq_data_entry *CQEntries = malloc(n_entries * sizeof(struct fi_cq_data_entry)); unsigned int current_backoff_seconds = 0; @@ -259,7 +260,7 @@ static void *make_progress(void *params_) { process_pending_fi_reads(params); - ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); + ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntries, n_entries); if (rc < 1) { struct fi_cq_err_entry error = {.err = 0}; @@ -274,21 +275,27 @@ static void *make_progress(void *params_) } printf("Backing off for %d seconds\n", current_backoff_seconds); sleep(current_backoff_seconds); - if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) + if (current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) { ++current_backoff_seconds; } } else { - struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); - next_item->value = CQEntry; - next_item->next = NULL; - cq_manual_progress_push(params, next_item); - CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + for (size_t i = 0; i < rc; ++i) + { + + struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); + struct fi_cq_data_entry *new_entry = malloc(sizeof(struct fi_cq_data_entry)); + memcpy(new_entry, &CQEntries[i], sizeof(struct fi_cq_data_entry)); + next_item->value = new_entry; + next_item->next = NULL; + cq_manual_progress_push(params, next_item); + } current_backoff_seconds = 0; } } + free(CQEntries); return NULL; } From 3e14cd321e8b3ce2fe69b4f850ec00db73eef6b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 12 Dec 2023 15:09:37 +0100 Subject: [PATCH 20/50] Revert last two commits Revert "Request n items at once" This reverts commit 4b4909e6bcaa322585df93ca4e917c7065665ee0. Revert "Try enqueuing the fi_read() on the thread" This reverts commit 6d550bcb10e3b0db03a4050ba5018319af23a07a. 
--- source/adios2/toolkit/sst/dp/rdma_dp.c | 130 +++++-------------------- 1 file changed, 26 insertions(+), 104 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 0962faf849..196cb70695 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -124,22 +124,6 @@ int guard_fi_return(int code, CP_Services Svcs, CManager cm, char const *msg) return code; } -struct pending_fi_reads -{ - int source_rank; - - struct fid_ep *ep; - void *buf; - size_t len; - void *desc; - fi_addr_t src_addr; - uint64_t addr; - uint64_t key; - void *context; - - struct pending_fi_reads *next; -}; - struct cq_event_list { struct fi_cq_data_entry *value; @@ -152,9 +136,6 @@ struct cq_manual_progress pthread_mutex_t cq_event_list_mutex; struct cq_event_list *cq_event_list; - struct pending_fi_reads *pending_fi_reads; - pthread_mutex_t pending_fi_reads_mutex; - CP_Services Svcs; void *Stream; int do_continue; @@ -200,67 +181,16 @@ struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) #define SST_BACKOFF_SECONDS_MAX 5 -static void process_pending_fi_reads(struct cq_manual_progress *self) -{ - pthread_mutex_lock(&self->pending_fi_reads_mutex); - - struct pending_fi_reads *head = self->pending_fi_reads; - self->pending_fi_reads = NULL; - struct pending_fi_reads **reinsert = &self->pending_fi_reads; - - while (head) - { - printf("Processing read of size %lu from %p to %p\n", head->len, (void *)head->addr, - head->buf); - ssize_t rc = fi_read(head->ep, head->buf, head->len, head->desc, head->src_addr, head->addr, - head->key, head->context); - switch (rc) - { - case -EAGAIN: { - struct pending_fi_reads *next = head->next; - head->next = NULL; - *reinsert = head; - reinsert = &head->next; - head = next; - break; - } - case 0: { - self->Svcs->verbose(self->Stream, DPTraceVerbose, - "Posted RDMA get for Writer Rank %d for handle %p\n", - head->source_rank, head->context); - struct pending_fi_reads *next = head->next; - free(head); - head = next; - break; - } - default: { - self->Svcs->verbose( - self->Stream, DPCriticalVerbose, - "fi_read failed with code %d. 
Will skip, but there might be hangups.\n", rc); - struct pending_fi_reads *next = head->next; - free(head); - head = next; - break; - } - } - } - - pthread_mutex_unlock(&self->pending_fi_reads_mutex); -} - static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; - size_t const n_entries = 100; - struct fi_cq_data_entry *CQEntries = malloc(n_entries * sizeof(struct fi_cq_data_entry)); + struct fi_cq_data_entry *CQEntry = malloc(sizeof(struct fi_cq_data_entry)); unsigned int current_backoff_seconds = 0; while (params->do_continue) { - process_pending_fi_reads(params); - - ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntries, n_entries); + ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); if (rc < 1) { struct fi_cq_err_entry error = {.err = 0}; @@ -273,29 +203,23 @@ static void *make_progress(void *params_) fi_strerror(error.err), fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); } - printf("Backing off for %d seconds\n", current_backoff_seconds); + printf("Backung off for %d seconds\n", current_backoff_seconds); sleep(current_backoff_seconds); - if (current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) + if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) { ++current_backoff_seconds; } } else { - for (size_t i = 0; i < rc; ++i) - { - - struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); - struct fi_cq_data_entry *new_entry = malloc(sizeof(struct fi_cq_data_entry)); - memcpy(new_entry, &CQEntries[i], sizeof(struct fi_cq_data_entry)); - next_item->value = new_entry; - next_item->next = NULL; - cq_manual_progress_push(params, next_item); - } + struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); + next_item->value = CQEntry; + next_item->next = NULL; + cq_manual_progress_push(params, next_item); + CQEntry = malloc(sizeof(struct fi_cq_data_entry)); current_backoff_seconds = 0; } } - free(CQEntries); return NULL; } @@ -1725,6 +1649,7 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo void *LocalDesc = NULL; uint8_t *Addr; RdmaCompletionHandle ret; + ssize_t rc; *ret_v = malloc(sizeof(struct _RdmaCompletionHandle)); ret = *ret_v; @@ -1758,30 +1683,27 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo "Remote read target is Rank %d (Offset = %zi, Length = %zi)\n", Rank, Offset, Length); + do { - struct pending_fi_reads *post_read = malloc(sizeof(struct pending_fi_reads)); - post_read->source_rank = Rank; - post_read->ep = Fabric->signal; - post_read->buf = Buffer; - post_read->len = Length; - post_read->desc = LocalDesc; - post_read->src_addr = SrcAddress; - post_read->addr = (uint64_t)Addr; - post_read->key = Info->Key; - post_read->context = ret; + rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, + Info->Key, ret); + } while (rc == -EAGAIN); - pthread_mutex_lock(&Fabric->cq_manual_progress.pending_fi_reads_mutex); - - post_read->next = Fabric->cq_manual_progress.pending_fi_reads; - Fabric->cq_manual_progress.pending_fi_reads = post_read; + if (rc != 0) + { + Svcs->verbose(RS_Stream->CP_Stream, DPCriticalVerbose, "fi_read failed with code %d.\n", + rc); + return (rc); + } + else + { - pthread_mutex_unlock(&Fabric->cq_manual_progress.pending_fi_reads_mutex); + Svcs->verbose(RS_Stream->CP_Stream, DPTraceVerbose, + "Posted RDMA get for Writer Rank %d for handle %p\n", Rank, (void *)ret); + RS_Stream->PendingReads++; } - // @todo: replace with an atomic 
and update in the thread? - // better safeguard against failures: if a read fails, we should not count it - RS_Stream->PendingReads++; - return (0); + return (rc); } static RdmaBuffer GetRequest(Rdma_RS_Stream Stream, RdmaStepLogEntry StepLog, int Rank, From 97f4d36e3d3d913c9b40c486a373af66268d68be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 12 Dec 2023 17:31:49 +0100 Subject: [PATCH 21/50] Memory management --- source/adios2/toolkit/sst/dp/rdma_dp.c | 53 +++++++++++++++++--------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 196cb70695..efb4fc5e3c 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -8,8 +8,6 @@ #include #include "adios2/common/ADIOSConfig.h" -#include "rdma/fi_eq.h" -#include "rdma/fi_errno.h" #include #include @@ -203,7 +201,6 @@ static void *make_progress(void *params_) fi_strerror(error.err), fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); } - printf("Backung off for %d seconds\n", current_backoff_seconds); sleep(current_backoff_seconds); if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) { @@ -248,7 +245,7 @@ struct fabric_state uint32_t credential; struct fi_gni_auth_key *auth_key; #endif /* SST_HAVE_CRAY_DRC */ - struct cq_manual_progress cq_manual_progress; + struct cq_manual_progress *cq_manual_progress; pthread_t pthread_id; }; @@ -257,7 +254,7 @@ void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry) unsigned int current_backoff_seconds = 0; while (1) { - struct fi_cq_data_entry *res = cq_manual_progress_pop(&fabric->cq_manual_progress); + struct fi_cq_data_entry *res = cq_manual_progress_pop(fabric->cq_manual_progress); if (res == NULL) { sleep(current_backoff_seconds); @@ -647,18 +644,24 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, fi_freeinfo(originfo); - fabric->cq_manual_progress.cq_signal = fabric->cq_signal; - if (pthread_mutex_init(&fabric->cq_manual_progress.cq_event_list_mutex, NULL) != 0) + fabric->cq_manual_progress = NULL; + + struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); + + manual_progress->cq_signal = fabric->cq_signal; + if (pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) { Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); return; } - fabric->cq_manual_progress.cq_event_list = NULL; - fabric->cq_manual_progress.Svcs = Svcs; - fabric->cq_manual_progress.Stream = CP_Stream; - fabric->cq_manual_progress.do_continue = 1; + manual_progress->cq_event_list = NULL; + manual_progress->Svcs = Svcs; + manual_progress->Stream = CP_Stream; + manual_progress->do_continue = 1; - if (pthread_create(&fabric->pthread_id, NULL, &make_progress, &fabric->cq_manual_progress) != 0) + fabric->cq_manual_progress = manual_progress; + + if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != 0) { Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); return; @@ -668,13 +671,27 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_Stream) { - fabric->cq_manual_progress.do_continue = 0; - // free other stuff - - if (pthread_join(fabric->pthread_id, NULL) != 0) + if (fabric->cq_manual_progress) { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not join 
thread.\n"); - return; + + fabric->cq_manual_progress->do_continue = 0; + + if (pthread_join(fabric->pthread_id, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not join thread.\n"); + return; + } + + pthread_mutex_destroy(&fabric->cq_manual_progress->cq_event_list_mutex); + + struct cq_event_list *head = fabric->cq_manual_progress->cq_event_list; + while (head) + { + struct cq_event_list *next = head->next; + free(head); + head = next; + } + free(fabric->cq_manual_progress); } int res; From 9878ba5b0acc81ce97b3622cc02c37f6f1e32a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 14 Dec 2023 16:45:44 +0100 Subject: [PATCH 22/50] Use blocking fi_cq_sread() in the worker thread It currently seems that the worker threads don't successfully finalize.. --- source/adios2/toolkit/sst/dp/rdma_dp.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index efb4fc5e3c..21b173aab9 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -184,11 +184,15 @@ static void *make_progress(void *params_) struct cq_manual_progress *params = (struct cq_manual_progress *)params_; struct fi_cq_data_entry *CQEntry = malloc(sizeof(struct fi_cq_data_entry)); - unsigned int current_backoff_seconds = 0; - while (params->do_continue) { - ssize_t rc = fi_cq_read(params->cq_signal, (void *)CQEntry, 1); + /* + * The main purpose of this worker thread is to make repeated blocking calls to the blocking + * fi_cq_sread() with a timeout of 5 seconds. Some providers don't make progress in a timely + * fashion otherwise (e.g. shm). + */ + ssize_t rc = + fi_cq_sread(params->cq_signal, (void *)CQEntry, 1, NULL, SST_BACKOFF_SECONDS_MAX); if (rc < 1) { struct fi_cq_err_entry error = {.err = 0}; @@ -201,11 +205,6 @@ static void *make_progress(void *params_) fi_strerror(error.err), fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); } - sleep(current_backoff_seconds); - if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) - { - ++current_backoff_seconds; - } } else { @@ -214,7 +213,6 @@ static void *make_progress(void *params_) next_item->next = NULL; cq_manual_progress_push(params, next_item); CQEntry = malloc(sizeof(struct fi_cq_data_entry)); - current_backoff_seconds = 0; } } return NULL; From 594b2e66c93c1716f26bede7a0a2566be043b154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 15 Dec 2023 12:27:13 +0100 Subject: [PATCH 23/50] Batch processing in fi_cq_sread() --- source/adios2/toolkit/sst/dp/rdma_dp.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 21b173aab9..a3add01824 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -182,7 +182,8 @@ struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; - struct fi_cq_data_entry *CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + size_t const batch_size = 100; + struct fi_cq_data_entry *CQEntries = malloc(batch_size * sizeof(struct fi_cq_data_entry)); while (params->do_continue) { @@ -191,8 +192,10 @@ static void *make_progress(void *params_) * fi_cq_sread() with a timeout of 5 seconds. 
Some providers don't make progress in a timely * fashion otherwise (e.g. shm). */ - ssize_t rc = - fi_cq_sread(params->cq_signal, (void *)CQEntry, 1, NULL, SST_BACKOFF_SECONDS_MAX); + printf("Going into fi_cq_sread()\n"); + ssize_t rc = fi_cq_sread(params->cq_signal, (void *)CQEntries, batch_size, NULL, + SST_BACKOFF_SECONDS_MAX * 1000); + printf("fi_cq_sread()=%ld\n", rc); if (rc < 1) { struct fi_cq_err_entry error = {.err = 0}; @@ -208,13 +211,19 @@ static void *make_progress(void *params_) } else { - struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); - next_item->value = CQEntry; - next_item->next = NULL; - cq_manual_progress_push(params, next_item); - CQEntry = malloc(sizeof(struct fi_cq_data_entry)); + for (size_t i = 0; i < rc; ++i) + { + struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); + struct fi_cq_data_entry * value = malloc(sizeof(struct fi_cq_data_entry)); + memcpy(value, &CQEntries[i], sizeof(struct fi_cq_data_entry)); + next_item->value = value; + next_item->next = NULL; + cq_manual_progress_push(params, next_item); + } } } + free(CQEntries); + printf("Returning from thread\n"); return NULL; } From fe839f2b758fe09a87ed643100ef0192fd139bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 15 Dec 2023 15:04:02 +0100 Subject: [PATCH 24/50] Yield to scheduler in busy loop --- source/adios2/toolkit/sst/dp/rdma_dp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index a3add01824..e9971168b2 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -1707,8 +1708,20 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo "Remote read target is Rank %d (Offset = %zi, Length = %zi)\n", Rank, Offset, Length); + /* + * The shm provider often returns -EAGAIN thousands of times. + * Yielding the thread after a certain number of attempts helps reduce context switches. 
+ */ + size_t const batch_size = 2000; + size_t counter = 1; do { + if (counter++ >= batch_size) + { + sched_yield(); + counter = 0; + } + ++counter; rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); } while (rc == -EAGAIN); From 6fc2b2e3cc1392dec202fbbe54d8b2ed562bbd64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Fri, 15 Dec 2023 15:19:10 +0100 Subject: [PATCH 25/50] Add FABRIC_PROVIDER environment variable --- source/adios2/toolkit/sst/dp/rdma_dp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index e9971168b2..b0b3fe9190 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -411,6 +411,14 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, fabric->info = NULL; + char const * provider_name = NULL; + if((provider_name = getenv("FABRIC_PROVIDER"))) + { + size_t len = strlen(provider_name); + hints->fabric_attr->prov_name = malloc(len + 1); + memcpy(hints->fabric_attr->prov_name, provider_name, len + 1); + } + pthread_mutex_lock(&fabric_mutex); fi_getinfo(fi_version, NULL, NULL, 0, hints, &info); pthread_mutex_unlock(&fabric_mutex); From e347a5ee82d7937d954ee9a97c3fcb7de613ae80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 22 Jan 2024 19:34:45 +0100 Subject: [PATCH 26/50] Use infinite timeout together with signal --- source/adios2/toolkit/sst/dp/rdma_dp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index b0b3fe9190..6b79185a2e 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -194,8 +194,7 @@ static void *make_progress(void *params_) * fashion otherwise (e.g. shm). 
*/ printf("Going into fi_cq_sread()\n"); - ssize_t rc = fi_cq_sread(params->cq_signal, (void *)CQEntries, batch_size, NULL, - SST_BACKOFF_SECONDS_MAX * 1000); + ssize_t rc = fi_cq_sread(params->cq_signal, (void *)CQEntries, batch_size, NULL, -1); printf("fi_cq_sread()=%ld\n", rc); if (rc < 1) { @@ -691,6 +690,7 @@ static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_ { fabric->cq_manual_progress->do_continue = 0; + fi_cq_signal(fabric->cq_signal); if (pthread_join(fabric->pthread_id, NULL) != 0) { From 009972c3b721bbe78fc643684635edaa5672f8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Wed, 17 Jan 2024 18:25:07 +0100 Subject: [PATCH 27/50] Verbose logging --- source/adios2/toolkit/sst/dp/ucx_dp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index dc51937a50..4bb41ceb79 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -98,6 +98,7 @@ struct fabric_state static ucs_status_t init_fabric(struct fabric_state *fabric, struct _SstParams *Params, CP_Services Svcs, void *CP_Stream) { + printf("CALL init_fabric\n"); ucp_params_t ucp_params; ucp_worker_params_t worker_params; ucp_config_t *config; @@ -153,6 +154,7 @@ static ucs_status_t init_fabric(struct fabric_state *fabric, struct _SstParams * static void fini_fabric(struct fabric_state *fabric) { + printf("CALL fini_fabric\n"); ucp_worker_destroy(fabric->ucp_worker); ucp_cleanup(fabric->ucp_context); } @@ -246,6 +248,7 @@ static DP_RS_Stream UcxInitReader(CP_Services Svcs, void *CP_Stream, void **Read struct _SstParams *Params, attr_list WriterContact, SstStats Stats) { + printf("CALL UcxInitReader\n"); Ucx_RS_Stream Stream = malloc(sizeof(struct _Ucx_RS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); ucs_status_t status; @@ -285,6 +288,7 @@ static DP_RS_Stream UcxInitReader(CP_Services Svcs, void *CP_Stream, void **Read static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { + printf("CALL UcxInitWriter\n"); Ucx_WS_Stream Stream = malloc(sizeof(struct _Ucx_WS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); ucs_status_t status; @@ -311,6 +315,7 @@ static DP_WSR_Stream UcxInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_St void **providedReaderInfo_v, void **WriterContactInfoPtr) { + printf("CALL UcxInitWriterPerReader\n"); Ucx_WS_Stream WS_Stream = (Ucx_WS_Stream)WS_Stream_v; Ucx_WSR_Stream WSR_Stream = malloc(sizeof(*WSR_Stream)); FabricState Fabric = WS_Stream->Fabric; @@ -348,6 +353,7 @@ static void UcxProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Strea int writerCohortSize, CP_PeerCohort PeerCohort, void **providedWriterInfo_v) { + printf("CALL UcxProvideWriterDataToReader\n"); Ucx_RS_Stream RS_Stream = (Ucx_RS_Stream)RS_Stream_v; FabricState Fabric = RS_Stream->Fabric; UcxWriterContactInfo *providedWriterInfo = (UcxWriterContactInfo *)providedWriterInfo_v; @@ -385,6 +391,7 @@ static void UcxProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Strea static void *UcxReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int Rank, long Timestep, size_t Offset, size_t Length, void *Buffer, void *DP_TimestepInfo) { + printf("CALL UcxReadRemoteMemory\n"); Ucx_RS_Stream RS_Stream = (Ucx_RS_Stream)Stream_v; UcxBufferHandle Info = (UcxBufferHandle)DP_TimestepInfo; uint8_t *Addr; @@ -454,6 +461,7 @@ static void 
*UcxReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int Ra static void UcxNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int FailedPeerRank) { + printf("CALL UcxNotifyConnFailure\n"); /* DP_RS_Stream is the return from InitReader */ Ucx_RS_Stream Stream = (Ucx_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, @@ -469,6 +477,7 @@ static void UcxNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int Fa */ static int UcxWaitForCompletion(CP_Services Svcs, void *Handle_v) { + printf("CALL UcxWaitForCompletion\n"); UcxCompletionHandle Handle = (UcxCompletionHandle)Handle_v; Ucx_RS_Stream Stream = Handle->CPStream; ucs_status_t status = UCS_ERR_LAST; @@ -505,6 +514,7 @@ static void UcxProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct _ struct _SstData *LocalMetadata, long Timestep, void **TimestepInfoPtr) { + printf("CALL UcxProvideTimestep\n"); Ucx_WS_Stream Stream = (Ucx_WS_Stream)Stream_v; TimestepList Entry = malloc(sizeof(struct _TimestepEntry)); UcxBufferHandle Info = malloc(sizeof(struct _UcxBufferHandle)); @@ -572,6 +582,7 @@ static void UcxProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct _ static void UcxReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Timestep) { + printf("CALL UcxReleaseTimestep\n"); Ucx_WS_Stream Stream = (Ucx_WS_Stream)Stream_v; FabricState Fabric = Stream->Fabric; TimestepList *List = &Stream->Timesteps; @@ -616,6 +627,7 @@ static void UcxReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Tim static void UcxDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) { + printf("CALL UcxDestroyReader\n"); Ucx_RS_Stream RS_Stream = (Ucx_RS_Stream)RS_Stream_v; Svcs->verbose(RS_Stream->CP_Stream, DPTraceVerbose, "Tearing down RDMA state on reader.\n"); @@ -630,6 +642,7 @@ static void UcxDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) static void UcxDestroyWriterPerReader(CP_Services Svcs, DP_WSR_Stream WSR_Stream_v) { + printf("CALL UcxDestroyWriterPerReader\n"); Ucx_WSR_Stream WSR_Stream = {0}; memcpy(&WSR_Stream, &WSR_Stream_v, sizeof(Ucx_WSR_Stream)); Ucx_WS_Stream WS_Stream = WSR_Stream->WS_Stream; @@ -677,6 +690,7 @@ static FMStructDescRec UcxBufferHandleStructs[] = { static void UcxDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) { + printf("CALL UcxDestroyWriter\n"); Ucx_WS_Stream WS_Stream = (Ucx_WS_Stream)WS_Stream_v; long Timestep; @@ -724,6 +738,7 @@ static struct _CP_DP_Interface UcxDPInterface = {0}; static int UcxGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params) { + printf("CALL UcxGetPriority\n"); /* TODO: Improve priority algorithm */ int ux_dp_priority = 10; @@ -736,12 +751,14 @@ static int UcxGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams * */ static void UcxUnGetPriority(CP_Services Svcs, void *CP_Stream) { + printf("CALL UcxUnGetPriority\n"); Svcs->verbose(CP_Stream, DPPerStepVerbose, "UCX Dataplane unloading\n"); } static void UcxTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Timestep, SstPreloadModeType PreloadMode) { + printf("CALL UcxTimestepArrived\n"); Ucx_RS_Stream Stream = (Ucx_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, "%s with Timestep = %li, PreloadMode = %d\n", @@ -750,6 +767,7 @@ static void UcxTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Tim extern NO_SANITIZE_THREAD CP_DP_Interface LoadUcxDP() { + printf("CALL LoadUcxDP\n"); UcxDPInterface.DPName = "ucx"; UcxDPInterface.ReaderContactFormats = UcxReaderContactStructs; 
UcxDPInterface.WriterContactFormats = UcxWriterContactStructs; From 3fc8de4461c7cba1a48760fc01e7715a57e4f46c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 22 Jan 2024 16:54:44 +0100 Subject: [PATCH 28/50] Revert "Verbose logging" This reverts commit 553f605740a1a7fb82635ffbbbe0aca88ca6f848. --- source/adios2/toolkit/sst/dp/ucx_dp.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index 4bb41ceb79..dc51937a50 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -98,7 +98,6 @@ struct fabric_state static ucs_status_t init_fabric(struct fabric_state *fabric, struct _SstParams *Params, CP_Services Svcs, void *CP_Stream) { - printf("CALL init_fabric\n"); ucp_params_t ucp_params; ucp_worker_params_t worker_params; ucp_config_t *config; @@ -154,7 +153,6 @@ static ucs_status_t init_fabric(struct fabric_state *fabric, struct _SstParams * static void fini_fabric(struct fabric_state *fabric) { - printf("CALL fini_fabric\n"); ucp_worker_destroy(fabric->ucp_worker); ucp_cleanup(fabric->ucp_context); } @@ -248,7 +246,6 @@ static DP_RS_Stream UcxInitReader(CP_Services Svcs, void *CP_Stream, void **Read struct _SstParams *Params, attr_list WriterContact, SstStats Stats) { - printf("CALL UcxInitReader\n"); Ucx_RS_Stream Stream = malloc(sizeof(struct _Ucx_RS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); ucs_status_t status; @@ -288,7 +285,6 @@ static DP_RS_Stream UcxInitReader(CP_Services Svcs, void *CP_Stream, void **Read static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { - printf("CALL UcxInitWriter\n"); Ucx_WS_Stream Stream = malloc(sizeof(struct _Ucx_WS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); ucs_status_t status; @@ -315,7 +311,6 @@ static DP_WSR_Stream UcxInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_St void **providedReaderInfo_v, void **WriterContactInfoPtr) { - printf("CALL UcxInitWriterPerReader\n"); Ucx_WS_Stream WS_Stream = (Ucx_WS_Stream)WS_Stream_v; Ucx_WSR_Stream WSR_Stream = malloc(sizeof(*WSR_Stream)); FabricState Fabric = WS_Stream->Fabric; @@ -353,7 +348,6 @@ static void UcxProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Strea int writerCohortSize, CP_PeerCohort PeerCohort, void **providedWriterInfo_v) { - printf("CALL UcxProvideWriterDataToReader\n"); Ucx_RS_Stream RS_Stream = (Ucx_RS_Stream)RS_Stream_v; FabricState Fabric = RS_Stream->Fabric; UcxWriterContactInfo *providedWriterInfo = (UcxWriterContactInfo *)providedWriterInfo_v; @@ -391,7 +385,6 @@ static void UcxProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Strea static void *UcxReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int Rank, long Timestep, size_t Offset, size_t Length, void *Buffer, void *DP_TimestepInfo) { - printf("CALL UcxReadRemoteMemory\n"); Ucx_RS_Stream RS_Stream = (Ucx_RS_Stream)Stream_v; UcxBufferHandle Info = (UcxBufferHandle)DP_TimestepInfo; uint8_t *Addr; @@ -461,7 +454,6 @@ static void *UcxReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int Ra static void UcxNotifyConnFailure(CP_Services Svcs, DP_RS_Stream Stream_v, int FailedPeerRank) { - printf("CALL UcxNotifyConnFailure\n"); /* DP_RS_Stream is the return from InitReader */ Ucx_RS_Stream Stream = (Ucx_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, @@ -477,7 +469,6 @@ static void UcxNotifyConnFailure(CP_Services 
Svcs, DP_RS_Stream Stream_v, int Fa */ static int UcxWaitForCompletion(CP_Services Svcs, void *Handle_v) { - printf("CALL UcxWaitForCompletion\n"); UcxCompletionHandle Handle = (UcxCompletionHandle)Handle_v; Ucx_RS_Stream Stream = Handle->CPStream; ucs_status_t status = UCS_ERR_LAST; @@ -514,7 +505,6 @@ static void UcxProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct _ struct _SstData *LocalMetadata, long Timestep, void **TimestepInfoPtr) { - printf("CALL UcxProvideTimestep\n"); Ucx_WS_Stream Stream = (Ucx_WS_Stream)Stream_v; TimestepList Entry = malloc(sizeof(struct _TimestepEntry)); UcxBufferHandle Info = malloc(sizeof(struct _UcxBufferHandle)); @@ -582,7 +572,6 @@ static void UcxProvideTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, struct _ static void UcxReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Timestep) { - printf("CALL UcxReleaseTimestep\n"); Ucx_WS_Stream Stream = (Ucx_WS_Stream)Stream_v; FabricState Fabric = Stream->Fabric; TimestepList *List = &Stream->Timesteps; @@ -627,7 +616,6 @@ static void UcxReleaseTimestep(CP_Services Svcs, DP_WS_Stream Stream_v, long Tim static void UcxDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) { - printf("CALL UcxDestroyReader\n"); Ucx_RS_Stream RS_Stream = (Ucx_RS_Stream)RS_Stream_v; Svcs->verbose(RS_Stream->CP_Stream, DPTraceVerbose, "Tearing down RDMA state on reader.\n"); @@ -642,7 +630,6 @@ static void UcxDestroyReader(CP_Services Svcs, DP_RS_Stream RS_Stream_v) static void UcxDestroyWriterPerReader(CP_Services Svcs, DP_WSR_Stream WSR_Stream_v) { - printf("CALL UcxDestroyWriterPerReader\n"); Ucx_WSR_Stream WSR_Stream = {0}; memcpy(&WSR_Stream, &WSR_Stream_v, sizeof(Ucx_WSR_Stream)); Ucx_WS_Stream WS_Stream = WSR_Stream->WS_Stream; @@ -690,7 +677,6 @@ static FMStructDescRec UcxBufferHandleStructs[] = { static void UcxDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) { - printf("CALL UcxDestroyWriter\n"); Ucx_WS_Stream WS_Stream = (Ucx_WS_Stream)WS_Stream_v; long Timestep; @@ -738,7 +724,6 @@ static struct _CP_DP_Interface UcxDPInterface = {0}; static int UcxGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params) { - printf("CALL UcxGetPriority\n"); /* TODO: Improve priority algorithm */ int ux_dp_priority = 10; @@ -751,14 +736,12 @@ static int UcxGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams * */ static void UcxUnGetPriority(CP_Services Svcs, void *CP_Stream) { - printf("CALL UcxUnGetPriority\n"); Svcs->verbose(CP_Stream, DPPerStepVerbose, "UCX Dataplane unloading\n"); } static void UcxTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Timestep, SstPreloadModeType PreloadMode) { - printf("CALL UcxTimestepArrived\n"); Ucx_RS_Stream Stream = (Ucx_RS_Stream)Stream_v; Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, "%s with Timestep = %li, PreloadMode = %d\n", @@ -767,7 +750,6 @@ static void UcxTimestepArrived(CP_Services Svcs, DP_RS_Stream Stream_v, long Tim extern NO_SANITIZE_THREAD CP_DP_Interface LoadUcxDP() { - printf("CALL LoadUcxDP\n"); UcxDPInterface.DPName = "ucx"; UcxDPInterface.ReaderContactFormats = UcxReaderContactStructs; UcxDPInterface.WriterContactFormats = UcxWriterContactStructs; From cb7808f4fbcb271cbf583aec3f55e887e93ef398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 22 Jan 2024 16:52:26 +0100 Subject: [PATCH 29/50] Use progress thread on writer side --- source/adios2/toolkit/sst/dp/ucx_dp.c | 35 ++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git 
a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index dc51937a50..cb30831d84 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -57,6 +57,9 @@ struct fabric_state ucp_address_t *local_addr; size_t local_addr_len; + + pthread_t progress_thread; + char keep_making_progress; }; /* @@ -115,7 +118,7 @@ static ucs_status_t init_fabric(struct fabric_state *fabric, struct _SstParams * return status; } ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES; - ucp_params.features = UCP_FEATURE_RMA; + ucp_params.features = UCP_FEATURE_RMA | UCP_FEATURE_WAKEUP; status = ucp_init(&ucp_params, config, &fabric->ucp_context); if (status != UCS_OK) @@ -282,6 +285,21 @@ static DP_RS_Stream UcxInitReader(CP_Services Svcs, void *CP_Stream, void **Read return Stream; } +typedef struct fabric_state progress_thread_params; + +static void *make_progress(void *params_) +{ + progress_thread_params *params = params_; + while (params->keep_making_progress) + { + ucp_worker_wait(params->ucp_worker); + while (ucp_worker_progress(params->ucp_worker) != 0) + { // go again} + } + } + return NULL; +} + static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { @@ -303,6 +321,13 @@ static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _Sst Stream->CP_Stream = CP_Stream; + Stream->Fabric->keep_making_progress = 1; + if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, Stream->Fabric) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return NULL; + } + return (void *)Stream; } @@ -701,6 +726,14 @@ static void UcxDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) Svcs->verbose(WS_Stream->CP_Stream, DPTraceVerbose, "Tearing down RDMA state on writer.\n"); + WS_Stream->Fabric->keep_making_progress = 0; + ucp_worker_signal(WS_Stream->Fabric->ucp_worker); + if (pthread_join(WS_Stream->Fabric->progress_thread, NULL) != 0) + { + Svcs->verbose(WS_Stream, DPCriticalVerbose, "Could not join thread.\n"); + return; + } + if (WS_Stream->Fabric) { fini_fabric(WS_Stream->Fabric); From 5fd64703884302b72df5337b39b35b7c1a3e0f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jan 2024 10:55:39 +0100 Subject: [PATCH 30/50] Add missing key destroy --- source/adios2/toolkit/sst/dp/ucx_dp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index cb30831d84..82ca457ab4 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -463,6 +463,7 @@ static void *UcxReadRemoteMemory(CP_Services Svcs, DP_RS_Stream Stream_v, int Ra param.op_attr_mask = 0; ret->req = ucp_get_nbx(RS_Stream->WriterEP[Rank], Buffer, Length, (uint64_t)Addr, rkey_p, ¶m); + ucp_rkey_destroy(rkey_p); status = UCS_PTR_STATUS(ret->req); if (status != UCS_OK && status != UCS_INPROGRESS) { From dfc943702985c6147eccf40fb75f686041b4f4ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jan 2024 12:06:26 +0100 Subject: [PATCH 31/50] Replace sleeping with condition variables --- source/adios2/toolkit/sst/dp/rdma_dp.c | 59 ++++++++++---------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 6b79185a2e..230e36c41b 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ 
b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -132,8 +132,11 @@ struct cq_event_list struct cq_manual_progress { struct fid_cq *cq_signal; - pthread_mutex_t cq_event_list_mutex; + struct cq_event_list *cq_event_list; + pthread_mutex_t cq_event_list_mutex; + pthread_cond_t cq_even_list_signal; + char cq_event_list_filled; CP_Services Svcs; void *Stream; @@ -156,46 +159,43 @@ void cq_manual_progress_push(struct cq_manual_progress *self, struct cq_event_li } head->next = item; } + self->cq_event_list_filled = 1; pthread_mutex_unlock(&self->cq_event_list_mutex); + pthread_cond_signal(&self->cq_even_list_signal); } struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) { struct fi_cq_data_entry *res; pthread_mutex_lock(&self->cq_event_list_mutex); - if (!self->cq_event_list) + while (!self->cq_event_list_filled) { - res = NULL; - } - else - { - struct cq_event_list *head = self->cq_event_list; - res = head->value; - self->cq_event_list = head->next; - free(head); + pthread_cond_wait(&self->cq_even_list_signal, &self->cq_event_list_mutex); } + assert(self->cq_event_list); + struct cq_event_list *head = self->cq_event_list; + res = head->value; + self->cq_event_list = head->next; + self->cq_event_list_filled = self->cq_event_list ? 1 : 0; pthread_mutex_unlock(&self->cq_event_list_mutex); + free(head); return res; } -#define SST_BACKOFF_SECONDS_MAX 5 - static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; size_t const batch_size = 100; - struct fi_cq_data_entry *CQEntries = malloc(batch_size * sizeof(struct fi_cq_data_entry)); + struct fi_cq_data_entry CQEntries[batch_size]; while (params->do_continue) { /* * The main purpose of this worker thread is to make repeated blocking calls to the blocking - * fi_cq_sread() with a timeout of 5 seconds. Some providers don't make progress in a timely - * fashion otherwise (e.g. shm). + * fi_cq_sread(). Some providers don't make progress in a timely fashion otherwise (e.g. + * shm). 
*/ - printf("Going into fi_cq_sread()\n"); ssize_t rc = fi_cq_sread(params->cq_signal, (void *)CQEntries, batch_size, NULL, -1); - printf("fi_cq_sread()=%ld\n", rc); if (rc < 1) { struct fi_cq_err_entry error = {.err = 0}; @@ -222,8 +222,6 @@ static void *make_progress(void *params_) } } } - free(CQEntries); - printf("Returning from thread\n"); return NULL; } @@ -259,22 +257,10 @@ struct fabric_state void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry) { unsigned int current_backoff_seconds = 0; - while (1) - { - struct fi_cq_data_entry *res = cq_manual_progress_pop(fabric->cq_manual_progress); - if (res == NULL) - { - sleep(current_backoff_seconds); - if(current_backoff_seconds < SST_BACKOFF_SECONDS_MAX) - { - ++current_backoff_seconds; - } - continue; - } - memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); - free(res); - return; - } + struct fi_cq_data_entry *res = cq_manual_progress_pop(fabric->cq_manual_progress); + memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); + free(res); + return; } /* @@ -670,9 +656,11 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, return; } manual_progress->cq_event_list = NULL; + manual_progress->cq_event_list_filled = 0; manual_progress->Svcs = Svcs; manual_progress->Stream = CP_Stream; manual_progress->do_continue = 1; + pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); fabric->cq_manual_progress = manual_progress; @@ -1559,7 +1547,6 @@ static DP_WSR_Stream RdmaInitWriterPerReader(CP_Services Svcs, DP_WS_Stream WS_S &WSR_Stream->rrmr, Fabric->ctx, Fabric->signal, Fabric->info->domain_attr->mr_mode); ReaderRollHandle->Key = fi_mr_key(WSR_Stream->rrmr); - printf("Key: %lu\n", ReaderRollHandle->Key); WSR_Stream->WriterContactInfo = ContactInfo; From 756bb2e11089ad25d3bff203eba868e90a12d16e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 7 Mar 2024 13:51:33 +0100 Subject: [PATCH 32/50] Remove sched_yield() logic --- source/adios2/toolkit/sst/dp/rdma_dp.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 230e36c41b..9491c451d6 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1703,20 +1703,8 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo "Remote read target is Rank %d (Offset = %zi, Length = %zi)\n", Rank, Offset, Length); - /* - * The shm provider often returns -EAGAIN thousands of times. - * Yielding the thread after a certain number of attempts helps reduce context switches. - */ - size_t const batch_size = 2000; - size_t counter = 1; do { - if (counter++ >= batch_size) - { - sched_yield(); - counter = 0; - } - ++counter; rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); } while (rc == -EAGAIN); From 01fc7a1176f061e1304fb2e83e67fd20f9f97f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 7 Mar 2024 15:58:00 +0100 Subject: [PATCH 33/50] Enqueue remote reads in batches Ref. the meeting with the Maestro team: The fabric should not be bombarded with too many requests at once. Batch size currently hardcoded as 10. 
--- source/adios2/engine/sst/SstReader.cpp | 69 +++++++++++++++++++------- source/adios2/engine/sst/SstReader.h | 13 ++++- source/adios2/engine/sst/SstReader.tcc | 22 ++++---- 3 files changed, 72 insertions(+), 32 deletions(-) diff --git a/source/adios2/engine/sst/SstReader.cpp b/source/adios2/engine/sst/SstReader.cpp index efb4ba511d..ee699da922 100644 --- a/source/adios2/engine/sst/SstReader.cpp +++ b/source/adios2/engine/sst/SstReader.cpp @@ -669,28 +669,46 @@ bool SstReader::VariableMinMax(const VariableBase &Var, const size_t Step, MinMa return m_BP5Deserializer->VariableMinMax(Var, Step, MinMax); } +void *SstReader::performDeferredReadRemoteMemory(DeferredReadRemoteMemory const ¶ms) +{ + return SstReadRemoteMemory(m_Input, (int)params.rank, CurrentStep(), params.payloadStart, + params.payloadSize, params.buffer, params.dp_info); +} + +constexpr static size_t BATCH_SIZE = 10; + void SstReader::BP5PerformGets() { size_t maxReadSize; auto ReadRequests = m_BP5Deserializer->GenerateReadRequests(true, &maxReadSize); std::vector sstReadHandlers; - for (const auto &Req : ReadRequests) + + auto iterator = ReadRequests.cbegin(); + auto end = ReadRequests.cend(); + while (iterator != end) { - void *dp_info = NULL; - if (m_CurrentStepMetaData->DP_TimestepInfo) + sstReadHandlers.clear(); + size_t counter = 0; + for (; counter < BATCH_SIZE && iterator != end; ++iterator, ++counter) { - dp_info = m_CurrentStepMetaData->DP_TimestepInfo[Req.WriterRank]; + auto const &Req = *iterator; + + void *dp_info = NULL; + if (m_CurrentStepMetaData->DP_TimestepInfo) + { + dp_info = m_CurrentStepMetaData->DP_TimestepInfo[Req.WriterRank]; + } + auto ret = SstReadRemoteMemory(m_Input, (int)Req.WriterRank, Req.Timestep, Req.StartOffset, + Req.ReadLength, Req.DestinationAddr, dp_info); + sstReadHandlers.push_back(ret); } - auto ret = SstReadRemoteMemory(m_Input, (int)Req.WriterRank, Req.Timestep, Req.StartOffset, - Req.ReadLength, Req.DestinationAddr, dp_info); - sstReadHandlers.push_back(ret); - } - for (const auto &i : sstReadHandlers) - { - if (SstWaitForCompletion(m_Input, i) != SstSuccess) + for (const auto &i : sstReadHandlers) { - helper::Throw("Engine", "SstReader", "BP5PerformGets", - "Writer failed before returning data"); + if (SstWaitForCompletion(m_Input, i) != SstSuccess) + { + helper::Throw("Engine", "SstReader", "BP5PerformGets", + "Writer failed before returning data"); + } } } @@ -710,7 +728,7 @@ void SstReader::PerformGets() } else if (m_WriterMarshalMethod == SstMarshalBP) { - std::vector sstReadHandlers; + std::vector sstReadHandlers; std::vector> buffers; size_t iter = 0; @@ -739,13 +757,26 @@ void SstReader::PerformGets() ADIOS2_FOREACH_STDTYPE_1ARG(declare_type) #undef declare_type } - // wait for all SstRead requests to finish - for (const auto &i : sstReadHandlers) + // run read requests in batches and wait for them to finish + auto iterator = sstReadHandlers.cbegin(); + auto end = sstReadHandlers.cend(); + std::vector enqueuedHandlers; + enqueuedHandlers.reserve(BATCH_SIZE); + while (iterator != end) { - if (SstWaitForCompletion(m_Input, i) != SstSuccess) + size_t counter = 0; + enqueuedHandlers.clear(); + for (; counter < BATCH_SIZE && iterator != end; ++iterator, ++counter) { - helper::Throw("Engine", "SstReader", "PerformGets", - "Writer failed before returning data"); + enqueuedHandlers.push_back(performDeferredReadRemoteMemory(*iterator)); + } + for (const auto &i : enqueuedHandlers) + { + if (SstWaitForCompletion(m_Input, i) != SstSuccess) + { + helper::Throw("Engine", "SstReader", 
"PerformGets", + "Writer failed before returning data"); + } } } diff --git a/source/adios2/engine/sst/SstReader.h b/source/adios2/engine/sst/SstReader.h index fa0aeca8b2..a0b5160ed7 100644 --- a/source/adios2/engine/sst/SstReader.h +++ b/source/adios2/engine/sst/SstReader.h @@ -54,8 +54,19 @@ class SstReader : public Engine bool VariableMinMax(const VariableBase &, const size_t Step, MinMaxStruct &MinMax); private: + struct DeferredReadRemoteMemory + { + size_t rank; + size_t payloadStart; + size_t payloadSize; + char *buffer; + void *dp_info; + }; + void * performDeferredReadRemoteMemory(DeferredReadRemoteMemory const &); + template - void ReadVariableBlocksRequests(Variable &variable, std::vector &sstReadHandlers, + void ReadVariableBlocksRequests(Variable &variable, + std::vector &sstReadHandlers, std::vector> &buffers); template diff --git a/source/adios2/engine/sst/SstReader.tcc b/source/adios2/engine/sst/SstReader.tcc index 39844b1c75..0c864da160 100644 --- a/source/adios2/engine/sst/SstReader.tcc +++ b/source/adios2/engine/sst/SstReader.tcc @@ -15,6 +15,7 @@ #include "adios2/helper/adiosFunctions.h" //GetDataType #include +#include namespace adios2 { @@ -25,7 +26,7 @@ namespace engine template void SstReader::ReadVariableBlocksRequests(Variable &variable, - std::vector &sstReadHandlers, + std::vector &sstReadHandlers, std::vector> &buffers) { PERFSTUBS_SCOPED_TIMER_FUNC(); @@ -65,9 +66,8 @@ void SstReader::ReadVariableBlocksRequests(Variable &variable, std::stringstream ss; ss << "SST Bytes Read from remote rank " << rank; PERFSTUBS_SAMPLE_COUNTER(ss.str().c_str(), payloadSize); - auto ret = SstReadRemoteMemory(m_Input, (int)rank, CurrentStep(), payloadStart, - payloadSize, buffer, dp_info); - sstReadHandlers.push_back(ret); + sstReadHandlers.push_back( + DeferredReadRemoteMemory{rank, payloadStart, payloadSize, buffer, dp_info}); } // if remote data buffer is not compressed else @@ -87,10 +87,9 @@ void SstReader::ReadVariableBlocksRequests(Variable &variable, subStreamInfo.IntersectionBox, m_BP3Deserializer->m_IsRowMajor, elementOffset)) { - auto ret = SstReadRemoteMemory(m_Input, (int)rank, CurrentStep(), - writerBlockStart, writerBlockSize, - blockInfo.Data + elementOffset, dp_info); - sstReadHandlers.push_back(ret); + sstReadHandlers.push_back(DeferredReadRemoteMemory{ + rank, writerBlockStart, writerBlockSize, + reinterpret_cast(blockInfo.Data + elementOffset), dp_info}); } // if either input or output is not contiguous memory then // find all contiguous parts. 
@@ -99,10 +98,9 @@ void SstReader::ReadVariableBlocksRequests(Variable &variable, // batch all read requests buffers.emplace_back(); buffers.back().resize(writerBlockSize); - auto ret = - SstReadRemoteMemory(m_Input, (int)rank, CurrentStep(), writerBlockStart, - writerBlockSize, buffers.back().data(), dp_info); - sstReadHandlers.push_back(ret); + sstReadHandlers.push_back( + DeferredReadRemoteMemory{rank, writerBlockStart, writerBlockSize, + buffers.back().data(), dp_info}); } } ++threadID; From a17ab8dc209ffa31f20306b000910db18cfda71c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 4 Apr 2024 11:44:33 -0400 Subject: [PATCH 34/50] Tmp: Ignore unreachable endpoints in UCX Todo: Better than doing this, initialize endpoints on demand only --- source/adios2/toolkit/sst/dp/ucx_dp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index 82ca457ab4..f364ae4115 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -397,9 +397,8 @@ static void UcxProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Strea if (status != UCS_OK) { Svcs->verbose(RS_Stream->CP_Stream, DPCriticalVerbose, - "UCX Error during ucp_ep_create() with: %s.\n", + "UCX Error during ucp_ep_create() with: %s. Let's ignore....\n", ucs_status_string(status)); - return; } Svcs->verbose(RS_Stream->CP_Stream, DPTraceVerbose, "Received contact info for WS_stream %p, WSR Rank %d\n", From c57a7b26c5d29d9939b9ebf51cdf8a57b6265d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 22 Jul 2024 17:52:03 +0200 Subject: [PATCH 35/50] Only use thread in libfabric DP when needed --- source/adios2/toolkit/sst/dp/rdma_dp.c | 100 +++++++++++++++++-------- 1 file changed, 70 insertions(+), 30 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 9491c451d6..af680daba9 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -254,13 +254,32 @@ struct fabric_state pthread_t pthread_id; }; -void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry) +void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry, CP_Services Svcs, + void *Stream) { - unsigned int current_backoff_seconds = 0; - struct fi_cq_data_entry *res = cq_manual_progress_pop(fabric->cq_manual_progress); - memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); - free(res); - return; + if (fabric->cq_manual_progress) + { + struct fi_cq_data_entry *res = cq_manual_progress_pop(fabric->cq_manual_progress); + memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); + free(res); + } + else + { + ssize_t rc = fi_cq_sread(fabric->cq_signal, (void *)CQEntry, 1, NULL, -1); + if (rc < 1) + { + struct fi_cq_err_entry error = {.err = 0}; + fi_cq_readerr(fabric->cq_signal, &error, 0); + if (error.err != -FI_SUCCESS) + { + Svcs->verbose( + Stream, DPCriticalVerbose, + "[PullSelection] no completion event (%d (%s - %s)).\n", rc, + fi_strerror(error.err), + fi_cq_strerror(fabric->cq_signal, error.err, error.err_data, NULL, error.len)); + } + } + } } /* @@ -643,32 +662,45 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, return; } - fi_freeinfo(originfo); - fabric->cq_manual_progress = NULL; - struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); - - manual_progress->cq_signal = fabric->cq_signal; - if 
(pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) + if (info->domain_attr->data_progress == FI_PROGRESS_MANUAL) { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); - return; - } - manual_progress->cq_event_list = NULL; - manual_progress->cq_event_list_filled = 0; - manual_progress->Svcs = Svcs; - manual_progress->Stream = CP_Stream; - manual_progress->do_continue = 1; - pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); + Svcs->verbose( + CP_Stream, DPTraceVerbose, + "Using a separate thread to comply with the fabric's manual progress preference.\n"); - fabric->cq_manual_progress = manual_progress; + struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); - if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != 0) + manual_progress->cq_signal = fabric->cq_signal; + if (pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); + return; + } + manual_progress->cq_event_list = NULL; + manual_progress->cq_event_list_filled = 0; + manual_progress->Svcs = Svcs; + manual_progress->Stream = CP_Stream; + manual_progress->do_continue = 1; + pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); + + fabric->cq_manual_progress = manual_progress; + + if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != + 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return; + } + } + else { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); - return; + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Using the fabric's automatic progress capability.\n"); } + + fi_freeinfo(originfo); } static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_Stream) @@ -1860,7 +1892,7 @@ static int DoPushWait(CP_Services Svcs, Rdma_RS_Stream Stream, RdmaCompletionHan while (Handle->Pending > 0) { - cq_read(Fabric, &CQEntry); + cq_read(Fabric, &CQEntry, Svcs, Stream); if (CQEntry.flags & FI_REMOTE_CQ_DATA) { BufferSlot = CQEntry.data >> 31; @@ -1926,7 +1958,7 @@ static int WaitForAnyPull(CP_Services Svcs, Rdma_RS_Stream Stream) RdmaCompletionHandle Handle_t; struct fi_cq_data_entry CQEntry = {0}; - cq_read(Fabric, &CQEntry); + cq_read(Fabric, &CQEntry, Svcs, Stream); { Svcs->verbose(Stream->CP_Stream, DPTraceVerbose, "got completion for request with handle %p (flags %li).\n", @@ -2301,6 +2333,14 @@ static int RdmaGetPriority(CP_Services Svcs, void *CP_Stream, struct _SstParams ifname = get_preferred_domain(Params); + char const *provider_name = NULL; + if ((provider_name = getenv("FABRIC_PROVIDER"))) + { + size_t len = strlen(provider_name); + hints->fabric_attr->prov_name = malloc(len + 1); + memcpy(hints->fabric_attr->prov_name, provider_name, len + 1); + } + forkunsafe = getenv("FI_FORK_UNSAFE"); if (!forkunsafe) { @@ -2588,7 +2628,7 @@ static void PostPreload(CP_Services Svcs, Rdma_RS_Stream Stream, long Timestep) while (WRidx > 0) { - cq_read(Fabric, &CQEntry); + cq_read(Fabric, &CQEntry, Svcs, Stream); CQBuffer = CQEntry.op_context; if (CQBuffer >= SendBuffer && CQBuffer < (SendBuffer + StepLog->WRanks)) { @@ -2715,7 +2755,7 @@ static void PullSelection(CP_Services Svcs, Rdma_WSR_Stream Stream) RankReq = Stream->PreloadReq; while (RankReq) { - cq_read(Fabric, &CQEntry); + cq_read(Fabric, &CQEntry, Svcs, Stream); CQRankReq = CQEntry.op_context; if (CQEntry.flags & FI_READ) { @@ -2747,7 
+2787,7 @@ static void CompletePush(CP_Services Svcs, Rdma_WSR_Stream Stream, TimestepList while (Step->OutstandingWrites > 0) { - cq_read(Fabric, &CQEntry); + cq_read(Fabric, &CQEntry, Svcs, Stream); if (CQEntry.flags & FI_WRITE) { CQTimestep = (long)CQEntry.op_context; From dbb1acecd3c57a0dd6b5baf27f2f3c620117a868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jul 2024 13:37:59 +0200 Subject: [PATCH 36/50] Parameterize progress thread in UCX --- source/adios2/toolkit/sst/dp/ucx_dp.c | 50 +++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index f364ae4115..43a6600590 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -292,7 +293,6 @@ static void *make_progress(void *params_) progress_thread_params *params = params_; while (params->keep_making_progress) { - ucp_worker_wait(params->ucp_worker); while (ucp_worker_progress(params->ucp_worker) != 0) { // go again} } @@ -306,6 +306,9 @@ static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _Sst Ucx_WS_Stream Stream = malloc(sizeof(struct _Ucx_WS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); ucs_status_t status; + char const *use_progress_thread_envvar; + size_t const max_len = 4; + char use_progress_thread[max_len]; memset(Stream, 0, sizeof(struct _Ucx_WS_Stream)); @@ -321,11 +324,35 @@ static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _Sst Stream->CP_Stream = CP_Stream; - Stream->Fabric->keep_making_progress = 1; - if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, Stream->Fabric) != 0) + /* + * `export UCX_POSIX_USE_PROC_LINK=n` might be necessary to make this work. 
+ */ + use_progress_thread_envvar = getenv("UCX_WRITER_PROGRESS_THREAD"); + if (use_progress_thread_envvar) { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); - return NULL; + strncpy(use_progress_thread, use_progress_thread_envvar, max_len); + } + + for (size_t i = 0; i < max_len; ++i) + { + use_progress_thread[i] = (char)tolower((int)use_progress_thread[i]); + } + + if (use_progress_thread_envvar && strncmp(use_progress_thread, "1", max_len) == 0 || + strncmp(use_progress_thread, "yes", max_len) == 0 || + strncmp(use_progress_thread, "on", max_len) == 0) + { + Stream->Fabric->keep_making_progress = 1; + if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, + Stream->Fabric) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return NULL; + } + } + else + { + Stream->Fabric->keep_making_progress = 0; } return (void *)Stream; @@ -726,12 +753,15 @@ static void UcxDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) Svcs->verbose(WS_Stream->CP_Stream, DPTraceVerbose, "Tearing down RDMA state on writer.\n"); - WS_Stream->Fabric->keep_making_progress = 0; - ucp_worker_signal(WS_Stream->Fabric->ucp_worker); - if (pthread_join(WS_Stream->Fabric->progress_thread, NULL) != 0) + if(WS_Stream->Fabric->keep_making_progress == 1) { - Svcs->verbose(WS_Stream, DPCriticalVerbose, "Could not join thread.\n"); - return; + WS_Stream->Fabric->keep_making_progress = 0; + ucp_worker_signal(WS_Stream->Fabric->ucp_worker); + if (pthread_join(WS_Stream->Fabric->progress_thread, NULL) != 0) + { + Svcs->verbose(WS_Stream, DPCriticalVerbose, "Could not join thread.\n"); + return; + } } if (WS_Stream->Fabric) From b98059abb1310550e186567c310a4e14eaaee97c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jul 2024 14:16:42 +0200 Subject: [PATCH 37/50] Request multithreading support from libfabric --- source/adios2/toolkit/sst/dp/rdma_dp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index af680daba9..c18addeab2 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -344,6 +344,7 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, hints->mode = FI_CONTEXT | FI_LOCAL_MR | FI_CONTEXT2 | FI_MSG_PREFIX | FI_ASYNC_IOV | FI_RX_CQ_DATA; hints->ep_attr->type = FI_EP_RDM; + hints->domain_attr->threading = FI_THREAD_SAFE; uint32_t fi_version; #ifdef SST_HAVE_CRAY_CXI From 32135754aaa518ce114e68d92cc59368e8a42518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jul 2024 14:30:04 +0200 Subject: [PATCH 38/50] Use progress thread only on writer side in libfabric --- source/adios2/toolkit/sst/dp/rdma_dp.c | 81 ++++++++++++++------------ 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index c18addeab2..f2fea4ef94 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -663,44 +663,6 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, return; } - fabric->cq_manual_progress = NULL; - - if (info->domain_attr->data_progress == FI_PROGRESS_MANUAL) - { - Svcs->verbose( - CP_Stream, DPTraceVerbose, - "Using a separate thread to comply with the fabric's manual progress preference.\n"); - - struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); 
- - manual_progress->cq_signal = fabric->cq_signal; - if (pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) - { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); - return; - } - manual_progress->cq_event_list = NULL; - manual_progress->cq_event_list_filled = 0; - manual_progress->Svcs = Svcs; - manual_progress->Stream = CP_Stream; - manual_progress->do_continue = 1; - pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); - - fabric->cq_manual_progress = manual_progress; - - if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != - 0) - { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); - return; - } - } - else - { - Svcs->verbose(CP_Stream, DPTraceVerbose, - "Using the fabric's automatic progress capability.\n"); - } - fi_freeinfo(originfo); } @@ -1357,6 +1319,44 @@ static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long } } +static int init_progress_thread(FabricState fabric, CP_Services Svcs, void *CP_Stream) +{ + if (fabric->info->domain_attr->data_progress != FI_PROGRESS_MANUAL) + { + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Using the fabric's automatic progress capability.\n"); + return EXIT_SUCCESS; + } + Svcs->verbose( + CP_Stream, DPTraceVerbose, + "Using a separate thread to comply with the fabric's manual progress preference.\n"); + + struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); + + manual_progress->cq_signal = fabric->cq_signal; + if (pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); + return EXIT_FAILURE; + } + manual_progress->cq_event_list = NULL; + manual_progress->cq_event_list_filled = 0; + manual_progress->Svcs = Svcs; + manual_progress->Stream = CP_Stream; + manual_progress->do_continue = 1; + pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); + + fabric->cq_manual_progress = manual_progress; + + if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} + static DP_WS_Stream RdmaInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { @@ -1482,6 +1482,11 @@ static DP_WS_Stream RdmaInitWriter(CP_Services Svcs, void *CP_Stream, struct _Ss Svcs->verbose(CP_Stream, DPTraceVerbose, "Fabric Parameters:\n%s\n", fi_tostr(Fabric->info, FI_TYPE_INFO)); + if (init_progress_thread(Fabric, Svcs, CP_Stream) == EXIT_FAILURE) + { + goto err_out; + } + /* * save the CP_stream value of later use */ From 5b05c2e2600e4e4d362d9d6abb177d4a4a1faa55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jul 2024 18:05:46 +0200 Subject: [PATCH 39/50] More seamless batching for BP5 --- source/adios2/engine/sst/SstReader.cpp | 51 +++++++++++++++++++------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/source/adios2/engine/sst/SstReader.cpp b/source/adios2/engine/sst/SstReader.cpp index ee699da922..ed91519cd3 100644 --- a/source/adios2/engine/sst/SstReader.cpp +++ b/source/adios2/engine/sst/SstReader.cpp @@ -682,26 +682,47 @@ void SstReader::BP5PerformGets() size_t maxReadSize; auto ReadRequests = m_BP5Deserializer->GenerateReadRequests(true, &maxReadSize); std::vector sstReadHandlers; + std::vector nextSstReadHandlers; + 
sstReadHandlers.reserve(BATCH_SIZE); + nextSstReadHandlers.reserve(BATCH_SIZE); auto iterator = ReadRequests.cbegin(); auto end = ReadRequests.cend(); - while (iterator != end) - { - sstReadHandlers.clear(); - size_t counter = 0; - for (; counter < BATCH_SIZE && iterator != end; ++iterator, ++counter) + + auto enqueue_next = [&](std::vector& sstReadHandlers_lambda) { + if (iterator == end) { - auto const &Req = *iterator; + return false; + } + auto const &Req = *iterator; - void *dp_info = NULL; - if (m_CurrentStepMetaData->DP_TimestepInfo) - { - dp_info = m_CurrentStepMetaData->DP_TimestepInfo[Req.WriterRank]; - } - auto ret = SstReadRemoteMemory(m_Input, (int)Req.WriterRank, Req.Timestep, Req.StartOffset, - Req.ReadLength, Req.DestinationAddr, dp_info); - sstReadHandlers.push_back(ret); + void *dp_info = NULL; + if (m_CurrentStepMetaData->DP_TimestepInfo) + { + dp_info = m_CurrentStepMetaData->DP_TimestepInfo[Req.WriterRank]; + } + auto ret = SstReadRemoteMemory(m_Input, (int)Req.WriterRank, Req.Timestep, Req.StartOffset, + Req.ReadLength, Req.DestinationAddr, dp_info); + sstReadHandlers_lambda.push_back(ret); + ++iterator; + return true; + }; + + // Initiate request queue with first BATCH_SIZE requests + for (size_t i = 0; i < BATCH_SIZE; ++i) + { + if (!enqueue_next(sstReadHandlers)) + { + break; } + } + + // Drain current request queue + // For each fulfilled request, enqueue the next into the next queue + // poor man's asynchrony + while (!sstReadHandlers.empty()) + { + nextSstReadHandlers.clear(); for (const auto &i : sstReadHandlers) { if (SstWaitForCompletion(m_Input, i) != SstSuccess) @@ -709,7 +730,9 @@ void SstReader::BP5PerformGets() helper::Throw("Engine", "SstReader", "BP5PerformGets", "Writer failed before returning data"); } + enqueue_next(nextSstReadHandlers); } + sstReadHandlers.swap(nextSstReadHandlers); } m_BP5Deserializer->FinalizeGets(ReadRequests); From 1d37205f83bba5aa4f8d0312a3ba2817fa6ab0e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jul 2024 18:29:26 +0200 Subject: [PATCH 40/50] Same for BP --- source/adios2/engine/sst/SstReader.cpp | 29 +++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/source/adios2/engine/sst/SstReader.cpp b/source/adios2/engine/sst/SstReader.cpp index ed91519cd3..14ae298ac7 100644 --- a/source/adios2/engine/sst/SstReader.cpp +++ b/source/adios2/engine/sst/SstReader.cpp @@ -784,15 +784,32 @@ void SstReader::PerformGets() auto iterator = sstReadHandlers.cbegin(); auto end = sstReadHandlers.cend(); std::vector enqueuedHandlers; + std::vector nextEnqueuedHandlers; enqueuedHandlers.reserve(BATCH_SIZE); - while (iterator != end) + nextEnqueuedHandlers.reserve(BATCH_SIZE); + + auto enqueue_next = [&](std::vector &enqueuedHandlers) { + if (iterator == end) + { + return false; + } + enqueuedHandlers.push_back(performDeferredReadRemoteMemory(*iterator)); + ++iterator; + return true; + }; + + // Initiate request queue with first BATCH_SIZE requests + for (size_t i = 0; i < BATCH_SIZE; ++i) { - size_t counter = 0; - enqueuedHandlers.clear(); - for (; counter < BATCH_SIZE && iterator != end; ++iterator, ++counter) + if (!enqueue_next(enqueuedHandlers)) { - enqueuedHandlers.push_back(performDeferredReadRemoteMemory(*iterator)); + break; } + } + + while (!enqueuedHandlers.empty()) + { + nextEnqueuedHandlers.clear(); for (const auto &i : enqueuedHandlers) { if (SstWaitForCompletion(m_Input, i) != SstSuccess) @@ -800,7 +817,9 @@ void SstReader::PerformGets() helper::Throw("Engine", 
"SstReader", "PerformGets", "Writer failed before returning data"); } + enqueue_next(nextEnqueuedHandlers); } + enqueuedHandlers.swap(nextEnqueuedHandlers); } for (const std::string &name : m_BP3Deserializer->m_DeferredVariables) From 644915f7c7cb6d75db107c7f72189fa77c9d21cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 23 Jul 2024 19:25:08 +0200 Subject: [PATCH 41/50] Some comments --- source/adios2/engine/sst/SstReader.tcc | 1 - source/adios2/toolkit/sst/dp/rdma_dp.c | 32 ++++++++++++++++++++++---- source/adios2/toolkit/sst/dp/ucx_dp.c | 3 ++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/source/adios2/engine/sst/SstReader.tcc b/source/adios2/engine/sst/SstReader.tcc index 0c864da160..6ffff2247a 100644 --- a/source/adios2/engine/sst/SstReader.tcc +++ b/source/adios2/engine/sst/SstReader.tcc @@ -15,7 +15,6 @@ #include "adios2/helper/adiosFunctions.h" //GetDataType #include -#include namespace adios2 { diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index f2fea4ef94..af9999cef3 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -123,26 +123,37 @@ int guard_fi_return(int code, CP_Services Svcs, CManager cm, char const *msg) return code; } +// Linked list of events that were retrieved by the progress thread +// and that can be requested by the main thread struct cq_event_list { struct fi_cq_data_entry *value; + // possibly null + // if not a single item is emplaced, then cq_manual_progress.cq_event_list is null struct cq_event_list *next; }; +// Parameters for make_progress(), launched as a separate threads +// to make manual progress in fabrics that require it struct cq_manual_progress { struct fid_cq *cq_signal; struct cq_event_list *cq_event_list; + // for thread-safe concurrent access (1 writer 1 reader) pthread_mutex_t cq_event_list_mutex; - pthread_cond_t cq_even_list_signal; + // are there any events currently in the list? char cq_event_list_filled; + // signal is sent when an item is enplaced + pthread_cond_t cq_even_list_signal; CP_Services Svcs; void *Stream; + // main thread sets this to 0 for telling the thread to come home again int do_continue; }; +// called by progress thread void cq_manual_progress_push(struct cq_manual_progress *self, struct cq_event_list *item) { pthread_mutex_lock(&self->cq_event_list_mutex); @@ -164,6 +175,8 @@ void cq_manual_progress_push(struct cq_manual_progress *self, struct cq_event_li pthread_cond_signal(&self->cq_even_list_signal); } +// called by main thread +// will block until data becomes available struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) { struct fi_cq_data_entry *res; @@ -254,6 +267,9 @@ struct fabric_state pthread_t pthread_id; }; +// Wrapper for fi_cq_sread to be called in its stead from the main thread. +// If a progress thread is running, then we wait for data to become available there. +// Otherwise fi_cq_sread() is called synchronously. void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry, CP_Services Svcs, void *Stream) { @@ -387,14 +403,13 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, #else fi_version = FI_VERSION(1, 5); - // Alternatively, one could set mr_mode to - // FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL - // here. 
These flags are equivalent to FI_MR_BASIC, but unlike basic + // These flags are the same as required by FI_MR_BASIC, but unlike basic // registration, providers are not forced to keep those flags when they // think that not using the flags is better. // The RDMA DP is able to deal with this appropriately, and does so right // before calling fi_fabric() further below in this function. - // The main reason for keeping FI_MR_BASIC here is backward compatibility. + // So, we specify these flags instead of FI_MR_BASIC in order to leave the + // decision up to the providers. hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_LOCAL; hints->domain_attr->control_progress = FI_PROGRESS_AUTO; // data progress unspecified, both are fine @@ -673,6 +688,9 @@ static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_ { fabric->cq_manual_progress->do_continue = 0; + // make_progress() is still cluelessly waiting for anything to happen + // before it gets the chance to check the do_continue flag. + // so we give it some event. fi_cq_signal(fabric->cq_signal); if (pthread_join(fabric->pthread_id, NULL) != 0) @@ -1319,6 +1337,10 @@ static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long } } +// This is currently called only by the writer thread right after init_fabric(). +// Could be called by the reader, too, in order to make progress in the background. +// But unlike for the writer, it's not necessary as the reader will +// make explicit synchronous progress upon requesting data. static int init_progress_thread(FabricState fabric, CP_Services Svcs, void *CP_Stream) { if (fabric->info->domain_attr->data_progress != FI_PROGRESS_MANUAL) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index 43a6600590..f368909882 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -424,7 +424,8 @@ static void UcxProvideWriterDataToReader(CP_Services Svcs, DP_RS_Stream RS_Strea if (status != UCS_OK) { Svcs->verbose(RS_Stream->CP_Stream, DPCriticalVerbose, - "UCX Error during ucp_ep_create() with: %s. Let's ignore....\n", + "UCX Error during ucp_ep_create() with: %s. 
Let's ignore for now, this " + "point-to-point connection might not be needed.\n", ucs_status_string(status)); } Svcs->verbose(RS_Stream->CP_Stream, DPTraceVerbose, From 7efc224f6f6d53057437389445ff3bb65f91086d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Wed, 24 Jul 2024 12:20:56 +0200 Subject: [PATCH 42/50] Make this configurable via environment variable in libfabric --- source/adios2/toolkit/sst/dp/rdma_dp.c | 159 ++++++++++++++++++------- 1 file changed, 116 insertions(+), 43 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index af9999cef3..d5690eac75 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -1140,6 +1141,115 @@ static int get_cxi_auth_key_from_writer(struct cxi_auth_key *key, attr_list Writ } #endif +typedef enum +{ + ProgressThreadUnspecified, + ProgressThreadYes, + ProgressThreadNo +} ProgressThread; + +static ProgressThread use_progress_thread() +{ + size_t const max_len = 4; + char const *use_progress_thread_envvar = getenv("FABRIC_PROGRESS_THREAD"); + char use_progress_thread[max_len]; + + if (!use_progress_thread_envvar) + { + return ProgressThreadUnspecified; + } + + strncpy(use_progress_thread, use_progress_thread_envvar, max_len); + for (size_t i = 0; i < max_len; ++i) + { + use_progress_thread[i] = (char)tolower((int)use_progress_thread[i]); + } + + if (use_progress_thread_envvar && strncmp(use_progress_thread, "1", max_len) == 0 || + strncmp(use_progress_thread, "yes", max_len) == 0 || + strncmp(use_progress_thread, "on", max_len) == 0) + { + return ProgressThreadYes; + } + else + { + return ProgressThreadNo; + } +} + +// Called by writer as well as by the reader. +// For the writer, a separate progress thread is not needed as the reader will +// make explicit synchronous progress upon requesting data. +// In consequence, a progress thread is by default only launched on the writer +// side under the condition that the fabric indicates manual data progress. +// This behavior can be overridden using the environment variable `FABRIC_PROGRESS_THREAD`. +// Use cases for this: +// +// 1. Turn on a progress thread on the reader side as well for making progress +// asynchronously in the background. +// 2. The tcp provider claims that it supports automatic progress, but seems to hang up +// if a progress thread is not launched on the writer side. +// The env. var. can be used when the fabric behavior does not match its promises. +// 3. If for any reason the use of progress threads causes trouble, they can be turned +// off this way. +static int init_progress_thread(FabricState fabric, CP_Services Svcs, void *CP_Stream, + int is_reader) +{ + switch (use_progress_thread()) + { + case ProgressThreadUnspecified: + if (is_reader) + { + // Reader does not make manual progress by default. + // It will make synchronous progress anyway upon waiting for data. 
+ return EXIT_SUCCESS; + } + if (fabric->info->domain_attr->data_progress != FI_PROGRESS_MANUAL) + { + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Using the fabric's automatic progress capability.\n"); + return EXIT_SUCCESS; + } + Svcs->verbose( + CP_Stream, DPTraceVerbose, + "Using a separate thread to comply with the fabric's manual progress preference.\n"); + break; + case ProgressThreadYes: + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Using a separate thread for manual progress upon user request.\n"); + break; + case ProgressThreadNo: + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Not using a separate thread for manual progress upon user request.\n"); + return EXIT_SUCCESS; + } + + struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); + + manual_progress->cq_signal = fabric->cq_signal; + if (pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); + return EXIT_FAILURE; + } + manual_progress->cq_event_list = NULL; + manual_progress->cq_event_list_filled = 0; + manual_progress->Svcs = Svcs; + manual_progress->Stream = CP_Stream; + manual_progress->do_continue = 1; + pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); + + fabric->cq_manual_progress = manual_progress; + + if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} + static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **ReaderContactInfoPtr, struct _SstParams *Params, attr_list WriterContact, SstStats Stats) @@ -1259,6 +1369,11 @@ static DP_RS_Stream RdmaInitReader(CP_Services Svcs, void *CP_Stream, void **Rea return NULL; } + if (init_progress_thread(Fabric, Svcs, CP_Stream, /* is_reader = */ 1) == EXIT_FAILURE) + { + return NULL; + } + ContactInfo->Length = Fabric->info->src_addrlen; ContactInfo->Address = malloc(ContactInfo->Length); int error_code = fi_getname((fid_t)Fabric->signal, ContactInfo->Address, &ContactInfo->Length); @@ -1337,48 +1452,6 @@ static void RdmaWritePatternLocked(CP_Services Svcs, DP_RS_Stream Stream_v, long } } -// This is currently called only by the writer thread right after init_fabric(). -// Could be called by the reader, too, in order to make progress in the background. -// But unlike for the writer, it's not necessary as the reader will -// make explicit synchronous progress upon requesting data. 
-static int init_progress_thread(FabricState fabric, CP_Services Svcs, void *CP_Stream) -{ - if (fabric->info->domain_attr->data_progress != FI_PROGRESS_MANUAL) - { - Svcs->verbose(CP_Stream, DPTraceVerbose, - "Using the fabric's automatic progress capability.\n"); - return EXIT_SUCCESS; - } - Svcs->verbose( - CP_Stream, DPTraceVerbose, - "Using a separate thread to comply with the fabric's manual progress preference.\n"); - - struct cq_manual_progress *manual_progress = malloc(sizeof(struct cq_manual_progress)); - - manual_progress->cq_signal = fabric->cq_signal; - if (pthread_mutex_init(&manual_progress->cq_event_list_mutex, NULL) != 0) - { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not init mutex.\n"); - return EXIT_FAILURE; - } - manual_progress->cq_event_list = NULL; - manual_progress->cq_event_list_filled = 0; - manual_progress->Svcs = Svcs; - manual_progress->Stream = CP_Stream; - manual_progress->do_continue = 1; - pthread_cond_init(&manual_progress->cq_even_list_signal, NULL); - - fabric->cq_manual_progress = manual_progress; - - if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != 0) - { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} - static DP_WS_Stream RdmaInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { @@ -1504,7 +1577,7 @@ static DP_WS_Stream RdmaInitWriter(CP_Services Svcs, void *CP_Stream, struct _Ss Svcs->verbose(CP_Stream, DPTraceVerbose, "Fabric Parameters:\n%s\n", fi_tostr(Fabric->info, FI_TYPE_INFO)); - if (init_progress_thread(Fabric, Svcs, CP_Stream) == EXIT_FAILURE) + if (init_progress_thread(Fabric, Svcs, CP_Stream, /* is_reader = */ 0) == EXIT_FAILURE) { goto err_out; } From 3de00c0f29864a6465c0bf9cd650a65dfa98de79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Wed, 24 Jul 2024 15:39:25 +0200 Subject: [PATCH 43/50] Some cleanup in UCX --- source/adios2/toolkit/sst/dp/ucx_dp.c | 93 +++++++++++++++++++-------- 1 file changed, 65 insertions(+), 28 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index f368909882..b434d4a2f6 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -294,21 +294,59 @@ static void *make_progress(void *params_) while (params->keep_making_progress) { while (ucp_worker_progress(params->ucp_worker) != 0) - { // go again} + { // go again } + ucp_worker_wait(params->ucp_worker); } return NULL; } +typedef enum +{ + ProgressThreadUnspecified, + ProgressThreadYes, + ProgressThreadNo +} ProgressThread; + +/* + * `export UCX_POSIX_USE_PROC_LINK=n` might be necessary to make this work. 
+ */ +static ProgressThread use_progress_thread() +{ + size_t const max_len = 4; + char const *use_progress_thread_envvar = getenv("UCX_PROGRESS_THREAD"); + char use_progress_thread[max_len]; + + if (!use_progress_thread_envvar) + { + return ProgressThreadUnspecified; + } + + strncpy(use_progress_thread, use_progress_thread_envvar, max_len); + for (size_t i = 0; i < max_len; ++i) + { + use_progress_thread[i] = (char)tolower((int)use_progress_thread[i]); + } + + if (use_progress_thread_envvar && strncmp(use_progress_thread, "1", max_len) == 0 || + strncmp(use_progress_thread, "yes", max_len) == 0 || + strncmp(use_progress_thread, "on", max_len) == 0) + { + return ProgressThreadYes; + } + else + { + return ProgressThreadNo; + } +} + + static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { Ucx_WS_Stream Stream = malloc(sizeof(struct _Ucx_WS_Stream)); SMPI_Comm comm = Svcs->getMPIComm(CP_Stream); ucs_status_t status; - char const *use_progress_thread_envvar; - size_t const max_len = 4; - char use_progress_thread[max_len]; memset(Stream, 0, sizeof(struct _Ucx_WS_Stream)); @@ -324,35 +362,34 @@ static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _Sst Stream->CP_Stream = CP_Stream; - /* - * `export UCX_POSIX_USE_PROC_LINK=n` might be necessary to make this work. - */ - use_progress_thread_envvar = getenv("UCX_WRITER_PROGRESS_THREAD"); - if (use_progress_thread_envvar) - { - strncpy(use_progress_thread, use_progress_thread_envvar, max_len); - } - - for (size_t i = 0; i < max_len; ++i) - { - use_progress_thread[i] = (char)tolower((int)use_progress_thread[i]); - } - - if (use_progress_thread_envvar && strncmp(use_progress_thread, "1", max_len) == 0 || - strncmp(use_progress_thread, "yes", max_len) == 0 || - strncmp(use_progress_thread, "on", max_len) == 0) - { + switch (use_progress_thread()) + { + case ProgressThreadUnspecified: + // Since UCX does not allow asking the worker if it supports + // automatic progress, we can do no more here than make a generic + // decision for all users here. + // We consider manual progress as an optional feature that needs to be + // actively switched on. Treat this case same as ProgressThreadNo. 
+ Svcs->verbose( + CP_Stream, DPTraceVerbose, + "Not using a separate thread for manual progress since it was not requested.\n"); + Stream->Fabric->keep_making_progress = 0; + break; + case ProgressThreadNo: + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Not using a separate thread for manual progress upon user request.\n"); + Stream->Fabric->keep_making_progress = 0; + break; + case ProgressThreadYes: + Svcs->verbose(CP_Stream, DPTraceVerbose, + "Using a separate thread for manual progress upon user request.\n"); Stream->Fabric->keep_making_progress = 1; - if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, - Stream->Fabric) != 0) + if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, Stream->Fabric) != 0) { Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); return NULL; } - } - else - { - Stream->Fabric->keep_making_progress = 0; + break; } return (void *)Stream; From dc83736f81cfb960eba608201ffdcd9f643e139d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Thu, 25 Jul 2024 11:25:51 -0400 Subject: [PATCH 44/50] Some error resistance in CXI key retrieval --- source/adios2/toolkit/sst/dp/rdma_dp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index d5690eac75..c11a7bf396 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -991,6 +991,11 @@ static int get_cxi_auth_key_from_env(CP_Services Svcs, void *CP_Stream, struct _ char const *slingshot_devices = getenv("SLINGSHOT_DEVICES"); char const *preferred_device = get_preferred_domain(Params); + if ((!preferred_device && strncmp("cxi", preferred_device, 3) != 0) || !slingshot_devices) + { + return EXIT_FAILURE; + } + /* * In the following loop, find out if the preferred_device is found within * the slingshot_devices. From 5ee3ec71416c7a1e6ca2021cd745b49309faa7e3 Mon Sep 17 00:00:00 2001 From: Poeschel Date: Fri, 26 Jul 2024 17:29:54 +0200 Subject: [PATCH 45/50] Some final fixes --- source/adios2/toolkit/sst/dp/rdma_dp.c | 9 ++++++--- source/adios2/toolkit/sst/dp/ucx_dp.c | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index c11a7bf396..d5b6a6bcf8 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -78,16 +78,19 @@ int sst_fi_mr_reg( /* additional parameters for binding the mr to the endpoint*/ struct fid_ep *endpoint, int mr_mode) { + *mr = NULL; int res = fi_mr_reg(domain, buf, len, acs, offset, requested_key, flags, mr, context); int is_mr_endpoint = (mr_mode & FI_MR_ENDPOINT) != 0; if (!is_mr_endpoint) { return res; } - if (res != FI_SUCCESS) + if (res != FI_SUCCESS || !*mr) { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "fi_mr_reg failed with %ul (%s)\n", res, - fi_strerror(res)); + Svcs->verbose(CP_Stream, DPCriticalVerbose, + "fi_mr_reg failed with %ul (%s). 
A possible cause is that some providers do " + "not support automated key provisioning, but ADIOS2 currently requires it.\n", + res, fi_strerror(res)); return res; } diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index b434d4a2f6..9da3a7ddb5 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -314,7 +314,7 @@ typedef enum static ProgressThread use_progress_thread() { size_t const max_len = 4; - char const *use_progress_thread_envvar = getenv("UCX_PROGRESS_THREAD"); + char const *use_progress_thread_envvar = getenv("SST_UCX_PROGRESS_THREAD"); char use_progress_thread[max_len]; if (!use_progress_thread_envvar) From c5ba305167b65c72b1704e24f2884e771862df08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 29 Jul 2024 12:40:20 +0200 Subject: [PATCH 46/50] Add an inline comment --- source/adios2/toolkit/sst/dp/rdma_dp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index d5b6a6bcf8..21543094f7 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -626,6 +626,9 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, } av_attr.type = FI_AV_MAP; + // The shm provider crashes in fi_cq_read() if specifying + // a count larger than 256 here. + // As this an optimization flag only, it seems safer to skip it here. if (strncmp(fabric->info->fabric_attr->prov_name, "shm", 4) != 0) { av_attr.count = DP_AV_DEF_SIZE; From 4b638247bb6fe9a1f69ecd9c3e25d25dd1c09f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Mon, 29 Jul 2024 16:22:31 +0200 Subject: [PATCH 47/50] Enable manual progress without thread on reader side --- source/adios2/toolkit/sst/dp/rdma_dp.c | 128 +++++++++++++++++-------- 1 file changed, 90 insertions(+), 38 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 21543094f7..5de750abeb 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -199,6 +199,43 @@ struct fi_cq_data_entry *cq_manual_progress_pop(struct cq_manual_progress *self) return res; } +static void make_some_progress(struct cq_manual_progress *params, int timeout, + struct fi_cq_data_entry *CQEntries, size_t batch_size) +{ + struct fi_cq_data_entry data_entry; + if (!CQEntries || batch_size == 0) + { + // use stack-allocated "buffer" + CQEntries = &data_entry; + batch_size = 1; + } + ssize_t rc = fi_cq_sread(params->cq_signal, (void *)CQEntries, batch_size, NULL, timeout); + if (rc < 1) + { + struct fi_cq_err_entry error = {.err = 0}; + fi_cq_readerr(params->cq_signal, &error, 0); + if (error.err != -FI_SUCCESS) + { + params->Svcs->verbose( + params->Stream, DPCriticalVerbose, + "[PullSelection] no completion event (%d (%s - %s)).\n", rc, fi_strerror(error.err), + fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); + } + } + else + { + for (size_t i = 0; i < rc; ++i) + { + struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); + struct fi_cq_data_entry *value = malloc(sizeof(struct fi_cq_data_entry)); + memcpy(value, &CQEntries[i], sizeof(struct fi_cq_data_entry)); + next_item->value = value; + next_item->next = NULL; + cq_manual_progress_push(params, next_item); + } + } +} + static void *make_progress(void *params_) { struct cq_manual_progress *params = (struct cq_manual_progress *)params_; 
@@ -212,32 +249,7 @@ static void *make_progress(void *params_) * fi_cq_sread(). Some providers don't make progress in a timely fashion otherwise (e.g. * shm). */ - ssize_t rc = fi_cq_sread(params->cq_signal, (void *)CQEntries, batch_size, NULL, -1); - if (rc < 1) - { - struct fi_cq_err_entry error = {.err = 0}; - fi_cq_readerr(params->cq_signal, &error, 0); - if (error.err != -FI_SUCCESS) - { - params->Svcs->verbose( - params->Stream, DPCriticalVerbose, - "[PullSelection] no completion event (%d (%s - %s)).\n", rc, - fi_strerror(error.err), - fi_cq_strerror(params->cq_signal, error.err, error.err_data, NULL, error.len)); - } - } - else - { - for (size_t i = 0; i < rc; ++i) - { - struct cq_event_list *next_item = malloc(sizeof(struct cq_event_list)); - struct fi_cq_data_entry * value = malloc(sizeof(struct fi_cq_data_entry)); - memcpy(value, &CQEntries[i], sizeof(struct fi_cq_data_entry)); - next_item->value = value; - next_item->next = NULL; - cq_manual_progress_push(params, next_item); - } - } + make_some_progress(params, -1, CQEntries, batch_size); } return NULL; } @@ -279,6 +291,22 @@ void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry, CP_S { if (fabric->cq_manual_progress) { + if (fabric->pthread_id == 0) + { + // We're on the reader side. No progress thread was launched, but + // the fabric demands manual progress. + // We cannot directly call fi_cq_sread() now since it might have + // been called before at other places. There might be results + // in the queue. + if (!fabric->cq_manual_progress->cq_event_list_filled) + { + make_some_progress(fabric->cq_manual_progress, -1, NULL, 0); + if (!fabric->cq_manual_progress->cq_event_list_filled) + { + Svcs->verbose(Stream, DPCriticalVerbose, "[cq_read] no completion event."); + } + } + } struct fi_cq_data_entry *res = cq_manual_progress_pop(fabric->cq_manual_progress); memcpy(CQEntry, res, sizeof(struct fi_cq_data_entry)); free(res); @@ -293,9 +321,8 @@ void cq_read(struct fabric_state *fabric, struct fi_cq_data_entry *CQEntry, CP_S if (error.err != -FI_SUCCESS) { Svcs->verbose( - Stream, DPCriticalVerbose, - "[PullSelection] no completion event (%d (%s - %s)).\n", rc, - fi_strerror(error.err), + Stream, DPCriticalVerbose, "[cq_read] no completion event (%d (%s - %s)).\n", + rc, fi_strerror(error.err), fi_cq_strerror(fabric->cq_signal, error.err, error.err_data, NULL, error.len)); } } @@ -700,10 +727,13 @@ static void fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_ // so we give it some event. fi_cq_signal(fabric->cq_signal); - if (pthread_join(fabric->pthread_id, NULL) != 0) + if(fabric->pthread_id != 0) { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not join thread.\n"); - return; + if (pthread_join(fabric->pthread_id, NULL) != 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not join thread.\n"); + return; + } } pthread_mutex_destroy(&fabric->cq_manual_progress->cq_event_list_mutex); @@ -1206,14 +1236,18 @@ static ProgressThread use_progress_thread() static int init_progress_thread(FabricState fabric, CP_Services Svcs, void *CP_Stream, int is_reader) { + int do_init_thread = 1; switch (use_progress_thread()) { case ProgressThreadUnspecified: if (is_reader) { - // Reader does not make manual progress by default. - // It will make synchronous progress anyway upon waiting for data. - return EXIT_SUCCESS; + // Reader does not need to launch a thread for making progress as it + // will naturally arrive at points where it can make progress + // synchronously. 
+ // In this case, just initiate the progress queue (struct cq_manual_progress) + // so the reader knows to make progress at those points. + do_init_thread = 0; } if (fabric->info->domain_attr->data_progress != FI_PROGRESS_MANUAL) { @@ -1252,10 +1286,18 @@ static int init_progress_thread(FabricState fabric, CP_Services Svcs, void *CP_S fabric->cq_manual_progress = manual_progress; - if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != 0) + if (do_init_thread) { - Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); - return EXIT_FAILURE; + if (pthread_create(&fabric->pthread_id, NULL, &make_progress, fabric->cq_manual_progress) != + 0) + { + Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); + return EXIT_FAILURE; + } + } + else + { + fabric->pthread_id = 0; } return EXIT_SUCCESS; @@ -1851,6 +1893,16 @@ static ssize_t PostRead(CP_Services Svcs, Rdma_RS_Stream RS_Stream, int Rank, lo { rc = fi_read(Fabric->signal, Buffer, Length, LocalDesc, SrcAddress, (uint64_t)Addr, Info->Key, ret); + if (Fabric->cq_manual_progress && Fabric->pthread_id == 0) + { + /* + * Cannot make a blocking call here since maybe the fi_read() task + * above did not register, so there is nothing to wait for. + * Need to specify either a timeout or call this non-blockingly to + * ensure that this returns. + */ + make_some_progress(Fabric->cq_manual_progress, 0, NULL, 0); + } } while (rc == -EAGAIN); if (rc != 0) From 6498e1ed93a94c5c82577b48fa1630b6afff23ca Mon Sep 17 00:00:00 2001 From: Greg Eisenhauer Date: Sun, 11 Aug 2024 10:25:10 -0400 Subject: [PATCH 48/50] Format --- source/adios2/engine/sst/SstReader.cpp | 2 +- source/adios2/engine/sst/SstReader.h | 2 +- source/adios2/toolkit/sst/dp/rdma_dp.c | 6 +++--- source/adios2/toolkit/sst/dp/ucx_dp.c | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/source/adios2/engine/sst/SstReader.cpp b/source/adios2/engine/sst/SstReader.cpp index 14ae298ac7..33a1d119db 100644 --- a/source/adios2/engine/sst/SstReader.cpp +++ b/source/adios2/engine/sst/SstReader.cpp @@ -689,7 +689,7 @@ void SstReader::BP5PerformGets() auto iterator = ReadRequests.cbegin(); auto end = ReadRequests.cend(); - auto enqueue_next = [&](std::vector& sstReadHandlers_lambda) { + auto enqueue_next = [&](std::vector &sstReadHandlers_lambda) { if (iterator == end) { return false; diff --git a/source/adios2/engine/sst/SstReader.h b/source/adios2/engine/sst/SstReader.h index a0b5160ed7..64cfef6772 100644 --- a/source/adios2/engine/sst/SstReader.h +++ b/source/adios2/engine/sst/SstReader.h @@ -62,7 +62,7 @@ class SstReader : public Engine char *buffer; void *dp_info; }; - void * performDeferredReadRemoteMemory(DeferredReadRemoteMemory const &); + void *performDeferredReadRemoteMemory(DeferredReadRemoteMemory const &); template void ReadVariableBlocksRequests(Variable &variable, diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 5de750abeb..380493b3e9 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -462,8 +462,8 @@ static void init_fabric(struct fabric_state *fabric, struct _SstParams *Params, fabric->info = NULL; - char const * provider_name = NULL; - if((provider_name = getenv("FABRIC_PROVIDER"))) + char const *provider_name = NULL; + if ((provider_name = getenv("FABRIC_PROVIDER"))) { size_t len = strlen(provider_name); hints->fabric_attr->prov_name = malloc(len + 1); @@ -727,7 +727,7 @@ static void 
fini_fabric(struct fabric_state *fabric, CP_Services Svcs, void *CP_ // so we give it some event. fi_cq_signal(fabric->cq_signal); - if(fabric->pthread_id != 0) + if (fabric->pthread_id != 0) { if (pthread_join(fabric->pthread_id, NULL) != 0) { diff --git a/source/adios2/toolkit/sst/dp/ucx_dp.c b/source/adios2/toolkit/sst/dp/ucx_dp.c index 9da3a7ddb5..afa41f046c 100644 --- a/source/adios2/toolkit/sst/dp/ucx_dp.c +++ b/source/adios2/toolkit/sst/dp/ucx_dp.c @@ -340,7 +340,6 @@ static ProgressThread use_progress_thread() } } - static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _SstParams *Params, attr_list DPAttrs, SstStats Stats) { @@ -384,7 +383,8 @@ static DP_WS_Stream UcxInitWriter(CP_Services Svcs, void *CP_Stream, struct _Sst Svcs->verbose(CP_Stream, DPTraceVerbose, "Using a separate thread for manual progress upon user request.\n"); Stream->Fabric->keep_making_progress = 1; - if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, Stream->Fabric) != 0) + if (pthread_create(&Stream->Fabric->progress_thread, NULL, &make_progress, + Stream->Fabric) != 0) { Svcs->verbose(CP_Stream, DPCriticalVerbose, "Could not start thread.\n"); return NULL; @@ -791,7 +791,7 @@ static void UcxDestroyWriter(CP_Services Svcs, DP_WS_Stream WS_Stream_v) Svcs->verbose(WS_Stream->CP_Stream, DPTraceVerbose, "Tearing down RDMA state on writer.\n"); - if(WS_Stream->Fabric->keep_making_progress == 1) + if (WS_Stream->Fabric->keep_making_progress == 1) { WS_Stream->Fabric->keep_making_progress = 0; ucp_worker_signal(WS_Stream->Fabric->ucp_worker); From 7f17c0c16c43f849416427c6560f519492670464 Mon Sep 17 00:00:00 2001 From: Greg Eisenhauer Date: Sun, 11 Aug 2024 16:01:24 -0400 Subject: [PATCH 49/50] warning --- source/adios2/toolkit/sst/dp/rdma_dp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 380493b3e9..6b8accbe53 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1206,9 +1206,9 @@ static ProgressThread use_progress_thread() use_progress_thread[i] = (char)tolower((int)use_progress_thread[i]); } - if (use_progress_thread_envvar && strncmp(use_progress_thread, "1", max_len) == 0 || - strncmp(use_progress_thread, "yes", max_len) == 0 || - strncmp(use_progress_thread, "on", max_len) == 0) + if (use_progress_thread_envvar && ((strncmp(use_progress_thread, "1", max_len) == 0) || + (strncmp(use_progress_thread, "yes", max_len) == 0) || + (strncmp(use_progress_thread, "on", max_len) == 0))) { return ProgressThreadYes; } From ba7d6c151c11ca193374103d54c984fda6fe5c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20P=C3=B6schel?= Date: Tue, 6 Aug 2024 10:20:50 -0400 Subject: [PATCH 50/50] Fix env var handling for CXI key retrieval --- source/adios2/toolkit/sst/dp/rdma_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adios2/toolkit/sst/dp/rdma_dp.c b/source/adios2/toolkit/sst/dp/rdma_dp.c index 6b8accbe53..11eb1cabd7 100644 --- a/source/adios2/toolkit/sst/dp/rdma_dp.c +++ b/source/adios2/toolkit/sst/dp/rdma_dp.c @@ -1027,7 +1027,7 @@ static int get_cxi_auth_key_from_env(CP_Services Svcs, void *CP_Stream, struct _ char const *slingshot_devices = getenv("SLINGSHOT_DEVICES"); char const *preferred_device = get_preferred_domain(Params); - if ((!preferred_device && strncmp("cxi", preferred_device, 3) != 0) || !slingshot_devices) + if (!preferred_device || strncmp("cxi", preferred_device, 3) != 0 || 
!slingshot_devices) { return EXIT_FAILURE; }
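For reference, a self-contained sketch of the corrected guard; the helper and main() below are illustrative only, not code from rdma_dp.c. With the previous condition, a NULL preferred device was passed straight into strncmp(), while a non-NULL, non-CXI device was never rejected by the device part of the check; the fixed condition returns early in both cases and when SLINGSHOT_DEVICES is unset.

    #include <cstdio>
    #include <cstring>

    // Illustrative restatement of the fixed check: the CXI key lookup is only
    // attempted when a "cxi*" preferred device and SLINGSHOT_DEVICES are both present.
    bool cxi_key_lookup_possible(const char *preferred_device, const char *slingshot_devices)
    {
        if (!preferred_device || strncmp("cxi", preferred_device, 3) != 0 || !slingshot_devices)
        {
            return false; // mirrors the EXIT_FAILURE early return above
        }
        return true;
    }

    int main()
    {
        std::printf("%d\n", cxi_key_lookup_possible(nullptr, "cxi0"));  // 0: old check passed NULL to strncmp() here
        std::printf("%d\n", cxi_key_lookup_possible("mlx5_0", "cxi0")); // 0: old check let non-CXI devices through
        std::printf("%d\n", cxi_key_lookup_possible("cxi0", "cxi0"));   // 1: lookup may proceed
        return 0;
    }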