Skip to content

Commit

Permalink
ch4: add MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT
Browse files Browse the repository at this point in the history
When MPICH_DEBUG_REQUEST is defined and
MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT is set to a positive value, we dump
information about outstanding requests when the progress loop has not made
progress for a while. This is helpful in debugging TIMEOUT errors.
  • Loading branch information
hzhou committed May 15, 2021
1 parent 3fc146b commit 3350ea4
Showing 1 changed file with 47 additions and 0 deletions.
47 changes: 47 additions & 0 deletions src/mpid/ch4/src/ch4_progress.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,24 @@

#include "ch4_impl.h"


/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
cvars:
- name : MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT
category : CH4
type : int
default : 0
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_LOCAL
description : >-
Timeout in seconds after which outstanding requests are dumped if a progress wait has not made any progress. A value of 0 (the default) disables the dump. Only takes effect when MPICH_DEBUG_REQUEST is defined.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

/* Global progress (polling every vci) is required for correctness. Currently we adopt the
* simple approach to do global progress every MPIDI_CH4_PROG_POLL_MASK.
*
Expand Down Expand Up @@ -291,6 +309,33 @@ MPL_STATIC_INLINE_PREFIX int MPID_Progress_poke(void)
#define MPIDI_PROGRESS_YIELD() MPL_thread_yield()
#endif

/*
 * Debug aid for diagnosing hangs in a progress wait loop.
 *
 * Usage: place PROGRESS_START; once before the loop and PROGRESS_CHECK; once
 * per iteration that made no progress.
 *
 * When MPICH_DEBUG_REQUEST is defined and MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT
 * is positive, MPIR_Request_debug() is called once the loop has been spinning
 * for longer than that many seconds. Otherwise both macros are no-ops.
 *
 * PROGRESS_START declares locals (iter, progress_dumped, time_start) in the
 * enclosing scope, so it must appear where declarations are allowed and at
 * most once per scope.
 */
#ifdef MPICH_DEBUG_REQUEST
#define PROGRESS_START \
    int iter = 0; \
    int progress_dumped = 0; \
    MPL_time_t time_start; \
    if (MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT > 0) { \
        MPL_wtime(&time_start); \
    }

/*
 * The clock is sampled only once every 0xffff calls to keep the overhead in
 * the polling loop negligible. The counter is reset after each sample so the
 * elapsed time keeps being re-checked (a single check after the first 0xffff
 * polls would almost always happen long before a timeout given in seconds)
 * and so the counter can never overflow. progress_dumped limits the output to
 * one dump per wait.
 */
#define PROGRESS_CHECK \
    do { \
        if (MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT > 0 && !progress_dumped) { \
            iter++; \
            if (iter == 0xffff) { \
                double time_diff = 0.0; \
                MPL_time_t time_cur; \
                MPL_wtime(&time_cur); \
                MPL_wtime_diff(&time_start, &time_cur, &time_diff); \
                if (time_diff > MPIR_CVAR_CH4_DEBUG_PROGRESS_TIMEOUT) { \
                    MPIR_Request_debug(); \
                    progress_dumped = 1; \
                } \
                iter = 0; \
            } \
        } \
    } while (0)

#else
#define PROGRESS_START do {} while (0)
#define PROGRESS_CHECK do {} while (0)
#endif

MPL_STATIC_INLINE_PREFIX int MPID_Progress_wait(MPID_Progress_state * state)
{
int mpi_errno = MPI_SUCCESS;
Expand All @@ -305,12 +350,14 @@ MPL_STATIC_INLINE_PREFIX int MPID_Progress_wait(MPID_Progress_state * state)

#else
state->progress_made = 0;
PROGRESS_START;
while (1) {
mpi_errno = MPIDI_progress_test(state, 1);
MPIR_ERR_CHECK(mpi_errno);
if (state->progress_made) {
break;
}
PROGRESS_CHECK;
MPIDI_PROGRESS_YIELD();
}

Expand Down

0 comments on commit 3350ea4

Please sign in to comment.