Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix treematch topology assert #4644

Merged
merged 2 commits into from
Jan 3, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 108 additions & 85 deletions ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

#include "opal/mca/pmix/pmix.h"

/* #define __DEBUG__ 1 */
/* #define __DEBUG__ 1 */

/**
* This function performs an allreduce across all processes to detect oversubscription.
Expand Down Expand Up @@ -72,23 +72,25 @@ static int check_oversubscribing(int rank,
}

#ifdef __DEBUG__
static void dump_int_array( char* prolog, char* line_prolog, int* array, size_t length )
static void dump_int_array( int level, int output_id, char* prolog, char* line_prolog, int* array, size_t length )
{
size_t i;
if( -1 == output_id ) return;

fprintf(stdout,"%s : ", prolog);
opal_output_verbose(level, output_id, "%s : ", prolog);
for(i = 0; i < length ; i++)
fprintf(stdout,"%s [%lu:%i] ", line_prolog, i, array[i]);
fprintf(stdout,"\n");
opal_output_verbose(level, output_id, "%s [%lu:%i] ", line_prolog, i, array[i]);
opal_output_verbose(level, output_id, "\n");
}
static void dump_double_array( char* prolog, char* line_prolog, double* array, size_t length )
static void dump_double_array( int level, int output_id, char* prolog, char* line_prolog, double* array, size_t length )
{
size_t i;

fprintf(stdout,"%s : ", prolog);
if( -1 == output_id ) return;
opal_output_verbose(level, output_id, "%s : ", prolog);
for(i = 0; i < length ; i++)
fprintf(stdout,"%s [%lu:%lf] ", line_prolog, i, array[i]);
fprintf(stdout,"\n");
opal_output_verbose(level, output_id, "%s [%lu:%lf] ", line_prolog, i, array[i]);
opal_output_verbose(level, output_id, "\n");
}
#endif

Expand Down Expand Up @@ -152,9 +154,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
rank = ompi_comm_rank(comm_old);
size = ompi_comm_size(comm_old);

#ifdef __DEBUG__
fprintf(stdout,"Process rank is : %i\n",rank);
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Process rank is : %i\n",rank));
/**
* In order to decrease the number of loops let's use a trick:
* build the lindex_to_grank in the vpids array, and only allocate
Expand Down Expand Up @@ -184,8 +185,10 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,

#ifdef __DEBUG__
if ( 0 == rank ) {
dump_int_array("lindex_to_grank : ", "", lindex_to_grank, num_procs_in_node);
dump_int_array("Vpids : ", "", colors, size);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"lindex_to_grank : ", "", lindex_to_grank, num_procs_in_node);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Vpids : ", "", colors, size);
}
#endif
/* clean-up dupes in the array */
Expand All @@ -210,9 +213,11 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
for(i = idx = 0; i < size; i++)
if( vpids[i] != -1 )
nodes_roots[idx++] = i;
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"num nodes is %i\n", num_nodes));
#ifdef __DEBUG__
fprintf(stdout, "num nodes is %i\n", num_nodes);
dump_int_array("Root nodes are :\n", "root ", nodes_roots, num_nodes);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Root nodes are :\n", "root ", nodes_roots, num_nodes);
#endif
}
free(vpids);
Expand All @@ -229,10 +234,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
*/

if(hwloc_bitmap_isincluded(root_obj->cpuset,set)) { /* processes are not bound on the machine */
#ifdef __DEBUG__
if (0 == rank)
fprintf(stdout,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n");
#endif /* __DEBUG__ */
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n"));

/* we try to bind to cores or above objects if enough are present */
/* Not sure that cores are present in ALL nodes */
Expand All @@ -255,9 +259,9 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,

if(oversubscribing_objs) {
if(hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
#ifdef __DEBUG__
fprintf(stdout,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n");
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Oversubscribing OBJ/CORES resources => Trying to use PUs \n"));

oversubscribed_pus = check_oversubscribing(rank, num_nodes,
num_pus_in_node, num_procs_in_node,
nodes_roots, lindex_to_grank, comm_old);
Expand All @@ -266,9 +270,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
obj_rank = ompi_process_info.my_local_rank%num_pus_in_node;
effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
num_objs_in_node = num_pus_in_node;
#ifdef __DEBUG__
fprintf(stdout, "Process not bound : binding on PU#%i \n", obj_rank);
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Process %i not bound : binding on PU#%i \n", rank, obj_rank));
}
} else {
/* Bound processes will participate with the same data as before */
Expand All @@ -293,23 +296,24 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
hwloc_bitmap_singlify(set); /* we don't want the process to move */
hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
if( -1 == hwloc_err) {
free(colors);
hwloc_bitmap_free(set);
goto fallback; /* return with success */
}
#ifdef __DEBUG__
fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank);
#endif
/* The binding failure is local to this process. Either all processes agree to stop the
* reordering, or this process has to carry on through the entire reordering. We carry on.
*/
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Process %i failed to bind on OBJ#%i \n", rank, obj_rank));
} else
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Process %i not bound : binding on OBJ#%i \n",rank, obj_rank));
} else {
#ifdef __DEBUG__
fprintf(stdout, "Process %i bound on OBJ #%i \n", rank, obj_rank);
fprintf(stdout, "=====> Num obj in node : %i | num pus in node : %i\n", num_objs_in_node, num_pus_in_node);
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Process %i bound on OBJ #%i \n"
"=====> Num obj in node : %i | num pus in node : %i\n",
rank, obj_rank,
num_objs_in_node, num_pus_in_node));
}
} else {
#ifdef __DEBUG__
fprintf(stdout, "Oversubscribing PUs resources => Rank Reordering Impossible \n");
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Oversubscribing PUs resources => Rank Reordering Impossible \n"));
free(colors);
hwloc_bitmap_free(set);
goto fallback; /* return with success */
Expand All @@ -324,9 +328,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
myhierarchy[0] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, 0);
for (i = 1; i < array_size ; i++) {
myhierarchy[i] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, i);
#ifdef __DEBUG__
fprintf(stdout,"hierarchy[%i] = %i\n", i, myhierarchy[i]);
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"hierarchy[%i] = %i\n", i, myhierarchy[i]));
if ((myhierarchy[i] != 0) && (myhierarchy[i] != myhierarchy[i-1]))
numlevels++;
}
Expand All @@ -339,12 +342,14 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
tracker[idx] = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, 0);
free(myhierarchy);

#ifdef __DEBUG__
fprintf(stdout, ">>>>>>>>>>>>>>>>>>>>> Effective depth is : %i (total depth %i)| num_levels %i\n",
effective_depth, hwloc_topology_get_depth(opal_hwloc_topology), numlevels);
for(i = 0 ; i < numlevels ; i++)
fprintf(stdout, "tracker[%i] : arity %i | depth %i\n", i, tracker[i]->arity, tracker[i]->depth);
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
">>>>>>>>>>>>>>>>>>>>> Effective depth is : %i (total depth %i)| num_levels %i\n",
effective_depth, hwloc_topology_get_depth(opal_hwloc_topology), numlevels));
for(i = 0 ; i < numlevels ; i++) {
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"tracker[%i] : arity %i | depth %i\n",
i, tracker[i]->arity, tracker[i]->depth));
}
/* get the obj number */
localrank_to_objnum = (int *)calloc(num_procs_in_node, sizeof(int));
localrank_to_objnum[0] = obj_rank;
Expand Down Expand Up @@ -383,9 +388,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
*/
if( 0 == rank ) {

#ifdef __DEBUG__
fprintf(stderr,"========== Centralized Reordering ========= \n");
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"========== Centralized Reordering ========= \n"));
local_pattern = (double *)calloc(size*size,sizeof(double));
} else {
local_pattern = (double *)calloc(size,sizeof(double));
Expand Down Expand Up @@ -474,7 +478,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
memcpy(obj_mapping, obj_to_rank_in_comm, num_objs_total*sizeof(int));
}
#ifdef __DEBUG__
dump_int_array( "Obj mapping : ", "", obj_mapping, num_objs_total );
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Obj mapping : ", "", obj_mapping, num_objs_total );
#endif
} else {
if ( num_nodes > 1 ) {
Expand Down Expand Up @@ -539,7 +544,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
double **comm_pattern = NULL;

#ifdef __DEBUG__
dump_int_array("hierarchies : ", "", hierarchies, num_nodes*(TM_MAX_LEVELS+1));
dump_int_array(10, ompi_topo_base_framework.framework_output,
"hierarchies : ", "", hierarchies, num_nodes*(TM_MAX_LEVELS+1));
#endif
tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
tm_topology->nb_levels = hierarchies[0];
Expand Down Expand Up @@ -574,10 +580,12 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
tm_topology->arity[i] = hierarchies[i+1];
}
free(hierarchies);
#ifdef __DEBUG__
for(i = 0; i < tm_topology->nb_levels; i++)
fprintf(stdout,"topo_arity[%i] = %i\n", i, tm_topology->arity[i]);
#endif

for(i = 0; i < tm_topology->nb_levels; i++) {
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"topo_arity[%i] = %i\n", i, tm_topology->arity[i]));
}

/* compute the number of processing elements */
tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
tm_topology->nb_nodes[0] = 1;
Expand Down Expand Up @@ -624,11 +632,13 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
tm_topology->oversub_fact = 1;

#ifdef __DEBUG__
assert(num_objs_total == tm_topology->nb_nodes[tm_topology->nb_levels-1]);
assert(num_objs_total == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);

for(i = 0; i < tm_topology->nb_levels ; i++) {
fprintf(stdout,"tm topo node_id for level [%i] : ",i);
dump_int_array("", "", obj_mapping, tm_topology->nb_nodes[i]);
opal_output_verbose(10, ompi_topo_base_framework.framework_output,
"tm topo node_id for level [%i] : ",i);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"", "", obj_mapping, tm_topology->nb_nodes[i]);
}
tm_display_topology(tm_topology);
#endif
Expand All @@ -643,27 +653,30 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
comm_pattern[j][i] = comm_pattern[i][j];
}
#ifdef __DEBUG__
fprintf(stdout,"==== COMM PATTERN ====\n");
opal_output_verbose(10, ompi_topo_base_framework.framework_output,
"==== COMM PATTERN ====\n");
for( i = 0 ; i < size ; i++) {
dump_double_array("", "", comm_pattern[i], size);
dump_double_array(10, ompi_topo_base_framework.framework_output,
"", "", comm_pattern[i], size);
}
#endif
tm_optimize_topology(&tm_topology);
aff_mat = tm_build_affinity_mat(comm_pattern,size);
comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
sol = tm_compute_mapping(tm_topology, comm_tree);

assert((int)sol->k_length == size);

k = (int *)calloc(sol->k_length, sizeof(int));
for(idx = 0 ; idx < (int)sol->k_length ; idx++)
k[idx] = sol->k[idx][0];

#ifdef __DEBUG__
fprintf(stdout,"====> nb levels : %i\n",tm_topology->nb_levels);
dump_int_array("Rank permutation sigma/k : ", "", k, num_objs_total);
assert(size == sol->sigma_length);
dump_int_array("Matching : ", "",sol->sigma, sol->sigma_length);
opal_output_verbose(10, ompi_topo_base_framework.framework_output,
"====> nb levels : %i\n",tm_topology->nb_levels);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Rank permutation sigma/k : ", "", k, num_objs_total);
assert(size == (int)sol->sigma_length);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Matching : ", "",sol->sigma, sol->sigma_length);
#endif
free(obj_mapping);
free(comm_pattern);
Expand Down Expand Up @@ -725,9 +738,8 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,

/* Discover the local patterns */
if (rank == lindex_to_grank[0]) {
#ifdef __DEBUG__
fprintf(stderr,"========== Partially Distributed Reordering ========= \n");
#endif
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"========== Partially Distributed Reordering ========= \n"));
local_pattern = (double *)calloc(num_procs_in_node * num_procs_in_node, sizeof(double));
} else {
local_pattern = (double *)calloc(num_procs_in_node, sizeof(double));
Expand Down Expand Up @@ -773,12 +785,15 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
}

#ifdef __DEBUG__
fprintf(stdout,"========== COMM PATTERN ============= \n");
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"========== COMM PATTERN ============= \n"));
for(i = 0 ; i < num_procs_in_node ; i++){
fprintf(stdout," %i : ",i);
dump_double_array("", "", comm_pattern[i], num_procs_in_node);
opal_output_verbose(10, ompi_topo_base_framework.framework_output," %i : ",i);
dump_double_array(10, ompi_topo_base_framework.framework_output,
"", "", comm_pattern[i], num_procs_in_node);
}
fprintf(stdout,"======================= \n");
opal_output_verbose(10, ompi_topo_base_framework.framework_output,
"======================= \n");
#endif

tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
Expand Down Expand Up @@ -818,11 +833,16 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
tm_topology->oversub_fact = 1;

#ifdef __DEBUG__
assert(num_objs_in_node == tm_topology->nb_nodes[tm_topology->nb_levels-1]);
fprintf(stdout,"Levels in topo : %i | num procs in node : %i\n",tm_topology->nb_levels,num_procs_in_node);
for(i = 0; i < tm_topology->nb_levels ; i++){
fprintf(stdout,"Nb objs for level %i : %i | arity %i\n ",i,tm_topology->nb_nodes[i],tm_topology->arity[i]);
dump_int_array("", "Obj id ", tm_topology->node_id[i], tm_topology->nb_nodes[i]);
assert(num_objs_in_node == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Levels in topo : %i | num procs in node : %i\n",
tm_topology->nb_levels,num_procs_in_node));
for(i = 0; i < tm_topology->nb_levels ; i++) {
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"Nb objs for level %i : %lu | arity %i\n ",
i, tm_topology->nb_nodes[i],tm_topology->arity[i]));
dump_int_array(10, ompi_topo_base_framework.framework_output,
"", "Obj id ", tm_topology->node_id[i], tm_topology->nb_nodes[i]);
}
tm_display_topology(tm_topology);
#endif
Expand All @@ -831,17 +851,20 @@ int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
sol = tm_compute_mapping(tm_topology, comm_tree);

assert((int)sol->k_length == num_procs_in_node);
assert((int)sol->k_length == num_objs_in_node);

k = (int *)calloc(sol->k_length, sizeof(int));
for(idx = 0 ; idx < (int)sol->k_length ; idx++)
k[idx] = sol->k[idx][0];

#ifdef __DEBUG__
fprintf(stdout,"====> nb levels : %i\n",tm_topology->nb_levels);
dump_int_array("Rank permutation sigma/k : ", "", k, num_procs_in_node);
assert(num_procs_in_node == sol->sigma_length);
dump_int_array("Matching : ", "", sol->sigma, sol->sigma_length);
OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
"====> nb levels : %i\n",tm_topology->nb_levels));
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Rank permutation sigma/k : ", "", k, num_procs_in_node);
assert(num_procs_in_node == (int)sol->sigma_length);
dump_int_array(10, ompi_topo_base_framework.framework_output,
"Matching : ", "", sol->sigma, sol->sigma_length);
#endif

free(aff_mat->sum_row);
Expand Down
Loading