Skip to content

Commit

Permalink
fixed error handling of MPI errors on transfer of parameter data to p…
Browse files Browse the repository at this point in the history
…io_msg
  • Loading branch information
edhartnett committed Jun 1, 2016
1 parent 751adee commit 5d1dba0
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 70 deletions.
5 changes: 3 additions & 2 deletions src/clib/pio_get_nc_async.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,10 @@ int PIOc_get_vars_tc(int ncid, int varid, const PIO_Offset *start, const PIO_Off
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);

/* Broadcast values currently only known on computation tasks to IO tasks. */
if ((mpierr = MPI_Bcast(&num_elem, 1, MPI_OFFSET, ios->comproot, ios->my_comm)))
Expand Down
20 changes: 10 additions & 10 deletions src/clib/pio_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -895,9 +895,7 @@ int inq_var_handler(iosystem_desc_t *ios)
int ndims, dimids[NC_MAX_DIMS], natts;
int ret;

int my_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
LOG((1, "%d inq_var_handler\n", my_rank));
LOG((1, "inq_var_handler"));

/* Get the parameters for this function that the comp master
* task is broadcasting. */
Expand All @@ -915,9 +913,9 @@ int inq_var_handler(iosystem_desc_t *ios)
return PIO_EIO;
if ((mpierr = MPI_Bcast(&natts_present, 1, MPI_CHAR, 0, ios->intercomm)))
return PIO_EIO;
printf("%d inq_var_handler ncid = %d varid = %d name_present = %d xtype_present = %d ndims_present = %d "
"dimids_present = %d natts_present = %d\n",
my_rank, ncid, varid, name_present, xtype_present, ndims_present, dimids_present, natts_present);
LOG((2,"inq_var_handler ncid = %d varid = %d name_present = %d xtype_present = %d ndims_present = %d "
"dimids_present = %d natts_present = %d\n",
ncid, varid, name_present, xtype_present, ndims_present, dimids_present, natts_present));

/* Set the non-NULL pointers. */
if (name_present)
Expand All @@ -935,6 +933,9 @@ int inq_var_handler(iosystem_desc_t *ios)
if ((ret = PIOc_inq_var(ncid, varid, namep, xtypep, ndimsp, dimidsp, nattsp)))
return ret;

if (ndims_present)
LOG((2, "inq_var_handler ndims = %d", ndims));

return PIO_NOERR;
}

Expand Down Expand Up @@ -985,15 +986,13 @@ int sync_file_handler(iosystem_desc_t *ios)
int mpierr;
int ret;

int my_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
LOG((1, "%d sync_file_handler\n", my_rank));
LOG((1, "sync_file_handler"));

/* Get the parameters for this function that the comp master
* task is broadcasting. */
if ((mpierr = MPI_Bcast(&ncid, 1, MPI_INT, 0, ios->intercomm)))
return PIO_EIO;
LOG((1, "%d sync_file_handler got parameter ncid = %d\n", my_rank, ncid));
LOG((1, "sync_file_handler got parameter ncid = %d", ncid));

/* Call the sync file function. */
if ((ret = PIOc_sync(ncid)))
Expand Down Expand Up @@ -1786,6 +1785,7 @@ int PIOc_Init_Intercomm(int component_count, MPI_Comm peer_comm,
int iam;
int io_leader, comp_leader;
int root;
MPI_Group io_grp, comm_grp, union_grp;

MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

Expand Down
123 changes: 74 additions & 49 deletions src/clib/pio_nc_async.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,10 @@ int PIOc_inq(int ncid, int *ndimsp, int *nvarsp, int *ngattsp,
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -257,9 +258,10 @@ int PIOc_inq_type(int ncid, nc_type xtype, char *name, PIO_Offset *sizep)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -339,9 +341,10 @@ int PIOc_inq_format (int ncid, int *formatp)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -427,9 +430,10 @@ int PIOc_inq_dim(int ncid, int dimid, char *name, PIO_Offset *lenp)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -545,9 +549,10 @@ int PIOc_inq_dimid(int ncid, const char *name, int *idp)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* IO tasks call the netCDF functions. */
Expand Down Expand Up @@ -644,9 +649,10 @@ int PIOc_inq_var(int ncid, int varid, char *name, nc_type *xtypep, int *ndimsp,
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* Call the netCDF layer. */
Expand All @@ -670,6 +676,9 @@ int PIOc_inq_var(int ncid, int varid, char *name, nc_type *xtypep, int *ndimsp,
#endif /* _NETCDF */
}

if (ndimsp)
LOG((2, "PIOc_inq_var ndims = %d", *ndimsp));

/* Broadcast and check the return code. */
if ((mpierr = MPI_Bcast(&ierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
return check_mpi(file, mpierr, __FILE__, __LINE__);
Expand Down Expand Up @@ -697,6 +706,7 @@ int PIOc_inq_var(int ncid, int varid, char *name, nc_type *xtypep, int *ndimsp,
if ((mpierr = MPI_Bcast(ndimsp, 1, MPI_INT, ios->ioroot, ios->my_comm)))
return check_mpi(file, mpierr, __FILE__, __LINE__);
file->varlist[varid].ndims = (*ndimsp);
LOG((2, "PIOc_inq_var Bcast ndims = %d", *ndimsp));
}
if (dimidsp)
{
Expand Down Expand Up @@ -811,9 +821,10 @@ int PIOc_inq_varid (int ncid, const char *name, int *varidp)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -909,9 +920,10 @@ int PIOc_inq_att(int ncid, int varid, const char *name, nc_type *xtypep,
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1020,9 +1032,10 @@ int PIOc_inq_attname(int ncid, int varid, int attnum, char *name)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1120,9 +1133,10 @@ int PIOc_inq_attid(int ncid, int varid, const char *name, int *idp)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1213,9 +1227,10 @@ int PIOc_rename_dim(int ncid, int dimid, const char *name)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}


Expand Down Expand Up @@ -1300,9 +1315,10 @@ int PIOc_rename_var(int ncid, int varid, const char *name)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}


Expand Down Expand Up @@ -1394,9 +1410,10 @@ int PIOc_rename_att (int ncid, int varid, const char *name,
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1478,9 +1495,10 @@ int PIOc_del_att(int ncid, int varid, const char *name)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1551,9 +1569,10 @@ int PIOc_set_fill (int ncid, int fillmode, int *old_modep)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}


Expand Down Expand Up @@ -1616,9 +1635,10 @@ int pioc_change_def(int ncid, int is_enddef)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1743,9 +1763,10 @@ int PIOc_def_dim (int ncid, const char *name, PIO_Offset len, int *idp)


/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1843,9 +1864,10 @@ int PIOc_def_var (int ncid, const char *name, nc_type xtype, int ndims,
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -1930,9 +1952,10 @@ int PIOc_inq_var_fill(int ncid, int varid, int *no_fill, void *fill_valuep)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);
}

/* If this is an IO task, then call the netCDF function. */
Expand Down Expand Up @@ -2052,9 +2075,10 @@ int PIOc_get_att(int ncid, int varid, const char *name, void *ip)
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);

/* Broadcast values currently only known on computation tasks to IO tasks. */
LOG((2, "PIOc_get_att bcast from comproot = %d attlen = %d typelen = %d", ios->comproot, attlen, typelen));
Expand Down Expand Up @@ -2175,9 +2199,10 @@ int PIOc_put_att(int ncid, int varid, const char *name, nc_type xtype,
}

/* Handle MPI errors. */
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
check_mpi(file, mpierr2, __FILE__, __LINE__);
check_mpi(file, mpierr, __FILE__, __LINE__);
if (mpierr)
return check_mpi(file, mpierr, __FILE__, __LINE__);

/* Broadcast values currently only known on computation tasks to IO tasks. */
LOG((2, "PIOc_put_att bcast from comproot = %d typelen = %d", ios->comproot, typelen));
Expand Down
13 changes: 4 additions & 9 deletions src/clib/pio_put_nc_async.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,13 @@ int PIOc_put_vars_tc(int ncid, int varid, const PIO_Offset *start, const PIO_Off
}

/* Handle MPI errors. */
LOG((2, "checking mpierr = %d", mpierr));
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->ioroot, ios->my_comm)))
LOG((2, "PIOc_put_vars_tc bcasting mpierr = %d", mpierr));
if ((mpierr2 = MPI_Bcast(&mpierr, 1, MPI_INT, ios->comproot, ios->my_comm)))
return check_mpi(file, mpierr2, __FILE__, __LINE__);
LOG((2, "PIOc_put_vars_tc checking mpierr = %d", mpierr));
if (mpierr)
check_mpi(file, mpierr, __FILE__, __LINE__);
LOG((2, "checked mpierr = %d", mpierr));

/* /\* Broadcast values currently only known on computation tasks to IO tasks. *\/ */
/* if ((mpierr = MPI_Bcast(&ndims, 1, MPI_INT, ios->comproot, ios->my_comm))) */
/* check_mpi(file, mpierr, __FILE__, __LINE__); */
/* if ((mpierr = MPI_Bcast(&typelen, 1, MPI_OFFSET, ios->comproot, ios->my_comm))) */
/* check_mpi(file, mpierr, __FILE__, __LINE__); */
LOG((2, "PIOc_put_vars_tc checked mpierr = %d", mpierr));
}

/* If this is an IO task, then call the netCDF function. */
Expand Down

0 comments on commit 5d1dba0

Please sign in to comment.