From 76bb48b98684e2836176bebc2bad104eff3be36d Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 25 Jun 2024 15:42:15 -0700 Subject: [PATCH 1/2] fabtests/psm3: Disable fi_rdm_tagged_peek for cleanup failure fi_rdm_tagged_peek fails to clean up with "munmap_chunk(): invalid pointer" when trying to free hfi_nids in psm_ep.c:1161. This test is successful when FI_PROVIDER is unset and fails when it is set to "psm3" or "PSM3". There is an open issue in ofiwg/libfabric to track this bug. When it is resolved, we can re-enable this test. Issue opened: 10123 Signed-off-by: Zach Dworkin --- fabtests/test_configs/psm3/psm3.exclude | 1 + 1 file changed, 1 insertion(+) diff --git a/fabtests/test_configs/psm3/psm3.exclude b/fabtests/test_configs/psm3/psm3.exclude index b02ce740b93..418ba8a1b5d 100644 --- a/fabtests/test_configs/psm3/psm3.exclude +++ b/fabtests/test_configs/psm3/psm3.exclude @@ -17,3 +17,4 @@ rdm_cntr_pingpong multi_recv dgram_waitset multinode +rdm_tagged_peek From 375d2ca3420ee519d298412d0bd5add540864c6e Mon Sep 17 00:00:00 2001 From: Zach Dworkin Date: Tue, 25 Jun 2024 16:45:57 -0700 Subject: [PATCH 2/2] fabtests/ucx: disable fi_rdm_tagged_peek The ucx test fi_rdm_tagged_peek is failing on the cleanup path: ft_free_res() -> ft_close_fids() -> fi_close() -> ucx_ep_close() -> ucp_worker_destroy() -> ucp_worker_discard_uct_ep_progress() -> ucp_ep_destroy_base() -> __funlockfile() The reported error is: "Segmentation fault: address not mapped to object at address 0x8" This is a race condition and does not occur every time. To reproduce, run: server: fi_rdm_tagged_peek -p ucx -E client: fi_rdm_tagged_peek -p ucx -E server_address Issue 10126 is tracking this bug. Re-enable this test when it is resolved. 
Signed-off-by: Zach Dworkin --- fabtests/test_configs/ucx/ucx.exclude | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fabtests/test_configs/ucx/ucx.exclude b/fabtests/test_configs/ucx/ucx.exclude index 4bb3608c64b..fd557fb1678 100644 --- a/fabtests/test_configs/ucx/ucx.exclude +++ b/fabtests/test_configs/ucx/ucx.exclude @@ -43,3 +43,6 @@ writedata rdm_atomic # FI_INJECT_COMPLETE not supported -A inj_complete + +# Fails intermittently with a segfault on the cleanup path (race condition) +rdm_tagged_peek \ No newline at end of file