diff --git a/ompi/communicator/Makefile.am b/ompi/communicator/Makefile.am index 6f57a3787f9..5bf7a4ed10b 100644 --- a/ompi/communicator/Makefile.am +++ b/ompi/communicator/Makefile.am @@ -12,7 +12,7 @@ # All rights reserved. # Copyright (c) 2013-2017 Los Alamos National Security, LLC. All rights # reserved. -# Copyright (c) 2014 Research Organization for Information Science +# Copyright (c) 2014-2018 Research Organization for Information Science # and Technology (RIST). All rights reserved. # Copyright (c) 2016 IBM Corporation. All rights reserved. # $COPYRIGHT$ @@ -31,6 +31,5 @@ headers += \ lib@OMPI_LIBMPI_NAME@_la_SOURCES += \ communicator/comm_init.c \ communicator/comm.c \ - communicator/comm_cid.c \ communicator/comm_request.c diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 228abae7ab7..cf57d5ac0de 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -18,7 +18,7 @@ * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2012-2016 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2015 Mellanox Technologies. All rights reserved. @@ -49,6 +49,8 @@ #include "ompi/attribute/attribute.h" #include "ompi/communicator/communicator.h" +#include "ompi/mca/cid/cid.h" +#include "ompi/mca/cid/base/base.h" #include "ompi/mca/pml/pml.h" #include "ompi/request/request.h" @@ -358,7 +360,7 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group, } /* Determine context id. 
It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { goto exit; } @@ -368,7 +370,7 @@ int ompi_comm_create ( ompi_communicator_t *comm, ompi_group_t *group, newcomp->c_contextid, comm->c_contextid ); /* Activate the communicator and init coll-component */ - rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->activate (&newcomp, comm, NULL, NULL, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { goto exit; } @@ -593,7 +595,7 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, } /* set the rank to MPI_UNDEFINED. This prevents this process from interfering - * in ompi_comm_nextcid() and the collective module selection in ompi_comm_activate() + * in ompi_cid->nextcid() and the collective module selection in ompi_cid->activate() * for a communicator that will be freed anyway. */ if ( MPI_UNDEFINED == color || (inter && my_rsize==0)) { @@ -601,7 +603,7 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, } /* Determine context id. It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { goto exit; } @@ -613,7 +615,7 @@ int ompi_comm_split( ompi_communicator_t* comm, int color, int key, /* Activate the communicator and init coll-component */ - rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->activate (&newcomp, comm, NULL, NULL, NULL, false, mode); exit: free ( results ); @@ -909,7 +911,7 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key, } /* Determine context id.
It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { break; } @@ -921,7 +923,7 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key, } /* Activate the communicator and init coll-component */ - rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->activate (&newcomp, comm, NULL, NULL, NULL, false, mode); if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { break; } @@ -1004,7 +1006,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp } /* Determine context id. It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->nextcid (newcomp, comm, NULL, NULL, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { return rc; } @@ -1020,7 +1022,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp } /* activate communicator and init coll-module */ - rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode); + rc = ompi_cid->activate (&newcomp, comm, NULL, NULL, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { return rc; } @@ -1134,7 +1136,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request) } /* Determine context id. 
It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid_nb (context->newcomp, context->comm, NULL, NULL, + rc = ompi_cid->nextcid_nb (context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq); if (OMPI_SUCCESS != rc) { ompi_comm_request_return (request); @@ -1164,7 +1166,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request) context->newcomp->c_contextid, context->comm->c_contextid ); /* activate communicator and init coll-module */ - rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq); + rc = ompi_cid->activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq); if ( OMPI_SUCCESS != rc ) { return rc; } @@ -1206,7 +1208,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int } /* Determine context id. It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode); + rc = ompi_cid->nextcid (newcomp, comm, NULL, &tag, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { return rc; } @@ -1216,7 +1218,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int newcomp->c_contextid, comm->c_contextid ); /* activate communicator and init coll-module */ - rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode); + rc = ompi_cid->activate (&newcomp, comm, NULL, &tag, NULL, false, mode); if ( OMPI_SUCCESS != rc ) { return rc; } @@ -1885,7 +1887,7 @@ int ompi_comm_enable(ompi_communicator_t *old_comm, int ret = OMPI_SUCCESS; /* Determine context id. 
It is identical to f_2_c_handle */ - ret = ompi_comm_nextcid (new_comm, old_comm, NULL, NULL, NULL, false, + ret = ompi_cid->nextcid (new_comm, old_comm, NULL, NULL, NULL, false, OMPI_COMM_CID_INTRA); if (OMPI_SUCCESS != ret) { /* something wrong happened while setting the communicator */ @@ -1909,7 +1911,7 @@ int ompi_comm_enable(ompi_communicator_t *old_comm, goto complete_and_return; } - ret = ompi_comm_activate (&new_comm, old_comm, NULL, NULL, NULL, false, + ret = ompi_cid->activate (&new_comm, old_comm, NULL, NULL, NULL, false, OMPI_COMM_CID_INTRA); if (OMPI_SUCCESS != ret) { /* something wrong happened while setting the communicator */ diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 75aac4d49e3..9a5fa884e9d 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -18,7 +18,7 @@ * Copyright (c) 2011-2013 Inria. All rights reserved. * Copyright (c) 2011-2013 Universite Bordeaux 1 * All rights reserved. - * Copyright (c) 2015-2017 Research Organization for Information Science + * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. @@ -45,6 +45,7 @@ #include "ompi/attribute/attribute.h" #include "ompi/dpm/dpm.h" #include "ompi/memchecker.h" +#include "ompi/mca/cid/cid.h" /* ** Table for Fortran <-> C communicator handle conversion @@ -335,7 +336,7 @@ int ompi_comm_finalize(void) * the reference count by one more than other communicators, on order to * allow for deallocation with the parent communicator. Note, that * this only occurs if the cid of the local_comm is lower than of its - * parent communicator. Read the comment in comm_activate for + * parent communicator. Read the comment in ompi_cid_base_comm_activate() for * a full explanation. 
*/ if ( ompi_debug_show_handle_leaks && !(OMPI_COMM_IS_FREED(comm)) ){ @@ -456,6 +457,7 @@ static void ompi_comm_destruct(ompi_communicator_t* comm) comm->c_contextid)) { opal_pointer_array_set_item ( &ompi_mpi_communicators, comm->c_contextid, NULL); + ompi_cid->release(comm->c_contextid); } /* reset the ompi_comm_f_to_c_table entry */ diff --git a/ompi/communicator/comm_request.c b/ompi/communicator/comm_request.c index 272fc33600b..d96781e4677 100644 --- a/ompi/communicator/comm_request.c +++ b/ompi/communicator/comm_request.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2013-2016 Los Alamos National Security, LLC. All rights * reseved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2004-2016 The University of Tennessee and The University * of Tennessee Research Foundation. All rights @@ -29,7 +29,8 @@ bool ompi_comm_request_initialized = false; typedef struct ompi_comm_request_item_t { opal_list_item_t super; ompi_comm_request_callback_fn_t callback; - ompi_request_t *subreqs[OMPI_COMM_REQUEST_MAX_SUBREQ]; + ompi_request_t *static_subreqs[OMPI_COMM_REQUEST_MAX_SUBREQ]; + ompi_request_t **subreqs; int subreq_count; } ompi_comm_request_item_t; OBJ_CLASS_DECLARATION(ompi_comm_request_item_t); @@ -74,15 +75,20 @@ int ompi_comm_request_schedule_append (ompi_comm_request_t *request, ompi_comm_r ompi_comm_request_item_t *request_item; int i; - if (subreq_count > OMPI_COMM_REQUEST_MAX_SUBREQ) { - return OMPI_ERR_BAD_PARAM; - } - request_item = OBJ_NEW(ompi_comm_request_item_t); if (NULL == request_item) { return OMPI_ERR_OUT_OF_RESOURCE; } + if (subreq_count > OMPI_COMM_REQUEST_MAX_SUBREQ) { + ompi_request_t ** reqs = (ompi_request_t **)malloc(subreq_count * sizeof(ompi_request_t *)); + if (NULL == reqs) { + OBJ_RELEASE(request_item); + return OMPI_ERR_OUT_OF_RESOURCE; + } + request_item->subreqs = reqs; + } + 
request_item->callback = callback; for (i = 0 ; i < subreq_count ; ++i) { @@ -241,7 +247,19 @@ OBJ_CLASS_INSTANCE(ompi_comm_request_t, ompi_request_t, ompi_comm_request_construct, ompi_comm_request_destruct); -OBJ_CLASS_INSTANCE(ompi_comm_request_item_t, opal_list_item_t, NULL, NULL); +static void ompi_comm_request_item_construct (ompi_comm_request_item_t *request) { + request->subreqs = request->static_subreqs; /* constructor: a new item starts out pointing at its embedded static_subreqs array */ +} + +static void ompi_comm_request_item_destruct (ompi_comm_request_item_t *request) { + if (request->static_subreqs != request->subreqs) { /* destructor: subreqs was replaced by a malloc'ed array (ompi_comm_request_schedule_append does this when subreq_count exceeds OMPI_COMM_REQUEST_MAX_SUBREQ), so it is freed here */ + free(request->subreqs); + } +} + +OBJ_CLASS_INSTANCE(ompi_comm_request_item_t, opal_list_item_t, + ompi_comm_request_item_construct, + ompi_comm_request_item_destruct); ompi_comm_request_t *ompi_comm_request_get (void) { diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 4fe4721244c..2297790320a 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -18,7 +18,7 @@ * Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$ @@ -92,6 +92,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t); #define OMPI_COMM_ALLGATHER_TAG -31078 #define OMPI_COMM_BARRIER_TAG -31079 #define OMPI_COMM_ALLREDUCE_TAG -31080 +#define OMPI_COMM_LOCAL_TAG -31081 #define OMPI_COMM_ASSERT_NO_ANY_TAG 0x00000001 #define OMPI_COMM_ASSERT_NO_ANY_SOURCE 0x00000002 @@ -537,50 +538,6 @@ OMPI_DECLSPEC int ompi_comm_free (ompi_communicator_t **comm); ompi_communicator_t* ompi_comm_allocate (int local_group_size, int remote_group_size); -/** - * allocate new communicator ID - * @param newcomm: pointer to the new communicator - * @param oldcomm: original comm - * @param bridgecomm: bridge comm for intercomm_create - * @param mode: combination of input - * OMPI_COMM_CID_INTRA: intra-comm - * OMPI_COMM_CID_INTER: inter-comm - * OMPI_COMM_CID_GROUP: only decide CID within the ompi_group_t - * associated with the communicator. arg0 - * must point to an int which will be used - * as the pml tag for communication. - * OMPI_COMM_CID_INTRA_BRIDGE: 2 intracomms connected by - * a bridge comm. arg0 and arg1 must point - * to integers representing the local and - * remote leader ranks. the remote leader rank - * is a rank in the bridgecomm. - * OMPI_COMM_CID_INTRA_PMIX: 2 intracomms, leaders talk - * through PMIx. arg0 must point to an integer - * representing the local leader rank. arg1 - * must point to a string representing the - * port of the remote leader. - * @param send_first: to avoid a potential deadlock for - * the OOB version. - * This routine has to be thread safe in the final version. 
- */ -OMPI_DECLSPEC int ompi_comm_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, - bool send_first, int mode); - -/** - * allocate new communicator ID (non-blocking) - * @param newcomm: pointer to the new communicator - * @param oldcomm: original comm - * @param bridgecomm: bridge comm for intercomm_create - * @param mode: combination of input - * OMPI_COMM_CID_INTRA: intra-comm - * OMPI_COMM_CID_INTER: inter-comm - * This routine has to be thread safe in the final version. - */ -OMPI_DECLSPEC int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, - bool send_first, int mode, ompi_request_t **req); - /** * shut down the communicator infrastructure. */ @@ -672,26 +629,6 @@ int ompi_comm_determine_first ( ompi_communicator_t *intercomm, int high ); -OMPI_DECLSPEC int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, - const void *arg1, bool send_first, int mode); - -/** - * Non-blocking variant of comm_activate. 
- * - * @param[inout] newcomm New communicator - * @param[in] comm Parent communicator - * @param[in] bridgecomm Bridge communicator (used for PMIX and bridge modes) - * @param[in] arg0 Mode argument 0 - * @param[in] arg1 Mode argument 1 - * @param[in] send_first Send first from this process (PMIX mode only) - * @param[in] mode Collective mode - * @param[out] req New request object to track this operation - */ -OMPI_DECLSPEC int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, - const void *arg1, bool send_first, int mode, ompi_request_t **req); - /** * a simple function to dump the structure */ @@ -704,14 +641,6 @@ int ompi_comm_set_name (ompi_communicator_t *comm, const char *name ); extern int ompi_comm_num_dyncomm; -/* check whether any of the processes has requested support for - MPI_THREAD_MULTIPLE. Note, that this produces global - information across MPI_COMM_WORLD, in contrary to the local - flag ompi_mpi_thread_provided -*/ -OMPI_DECLSPEC int ompi_comm_cid_init ( void ); - - void ompi_comm_assert_subscribe (ompi_communicator_t *comm, int32_t assert_flag); END_C_DECLS diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index dfc7efc747f..d59baccbf53 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -16,7 +16,7 @@ * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * @@ -46,6 +46,8 @@ #include "opal/mca/pmix/pmix.h" #include "ompi/communicator/communicator.h" +#include "ompi/mca/cid/cid.h" +#include "ompi/mca/cid/base/base.h" #include "ompi/group/group.h" #include "ompi/proc/proc.h" #include "ompi/mca/pml/pml.h" @@ -475,7 +477,19 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, new_group_pointer = MPI_GROUP_NULL; /* allocate comm_cid */ - rc = ompi_comm_nextcid ( newcomp, /* new communicator */ + rc = ompi_cid->nextcid (newcomp, /* new communicator */ + comm, /* old communicator */ + NULL, /* bridge comm */ + &root, /* local leader */ + (void*)port_string, /* rendezvous point */ + send_first, /* send or recv first */ + OMPI_COMM_CID_INTRA_PMIX); /* mode */ + if (OMPI_SUCCESS != rc) { + goto exit; + } + + /* activate comm and init coll-component */ + rc = ompi_cid->activate (&newcomp, /* new communicator */ comm, /* old communicator */ NULL, /* bridge comm */ &root, /* local leader */ @@ -486,18 +500,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, goto exit; } - /* activate comm and init coll-component */ - rc = ompi_comm_activate ( &newcomp, /* new communicator */ - comm, /* old communicator */ - NULL, /* bridge comm */ - &root, /* local leader */ - (void*)port_string, /* rendezvous point */ - send_first, /* send or recv first */ - OMPI_COMM_CID_INTRA_PMIX); /* mode */ - if (OMPI_SUCCESS != rc) { - goto exit; - } - /* Question: do we have to re-start some low level stuff to enable the usage of fast communication devices between the two worlds ? diff --git a/ompi/mca/cid/Makefile.am b/ompi/mca/cid/Makefile.am new file mode 100644 index 00000000000..2068ac2f70a --- /dev/null +++ b/ompi/mca/cid/Makefile.am @@ -0,0 +1,29 @@ +# +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# main library setup +noinst_LTLIBRARIES = libmca_cid.la +libmca_cid_la_SOURCES = + +# local files +headers = cid.h +libmca_cid_la_SOURCES += $(headers) $(nodist_headers) + +# Conditionally install the header files +if WANT_INSTALL_HEADERS +ompidir = $(ompiincludedir)/$(subdir) +nobase_ompi_HEADERS = $(headers) +nobase_nodist_ompi_HEADERS = $(nodist_headers) +endif + +include base/Makefile.am + +distclean-local: + rm -f base/static-components.h diff --git a/ompi/mca/cid/base/Makefile.am b/ompi/mca/cid/base/Makefile.am new file mode 100644 index 00000000000..347cc69b029 --- /dev/null +++ b/ompi/mca/cid/base/Makefile.am @@ -0,0 +1,17 @@ +# +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +headers += \ + base/base.h + +libmca_cid_la_SOURCES += \ + base/cid_base_frame.c \ + base/cid_base_select.c \ + base/cid_base_stubs.c diff --git a/ompi/mca/cid/base/base.h b/ompi/mca/cid/base/base.h new file mode 100644 index 00000000000..67df322cbdd --- /dev/null +++ b/ompi/mca/cid/base/base.h @@ -0,0 +1,111 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_CID_BASE_H +#define MCA_CID_BASE_H + +#include "ompi_config.h" + +#include "ompi/mca/mca.h" +#include "opal/mca/base/mca_base_framework.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_pointer_array.h" + +#include "ompi/mca/cid/cid.h" + +/* + * Global functions for the CID framework + */ + +BEGIN_C_DECLS + +/* + * MCA framework + */ +OMPI_DECLSPEC extern mca_base_framework_t ompi_cid_base_framework; +/* query every available component and select the one reporting the highest priority */ +OMPI_DECLSPEC int ompi_cid_base_select(bool ompi_mpi_thread_multiple); + +struct ompi_cid_base_cid_context_t; + +typedef int (*ompi_cid_base_allreduce_impl_fn_t) (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, + struct ompi_cid_base_cid_context_t *cid_context, + ompi_request_t **req); + +struct ompi_cid_base_cid_context_t { + opal_object_t super; + + ompi_communicator_t *newcomm; + ompi_communicator_t **newcommp; + ompi_communicator_t *comm; + ompi_communicator_t *bridgecomm; + + ompi_cid_base_allreduce_impl_fn_t allreduce_fn; + + int nextcid; + int nextlocal_cid; + int start; + int flag, rflag; + int local_leader; + int remote_leader; + int iter; + /** storage for activate barrier */ + int ok; + char *port_string; + bool send_first; + int pml_tag; + char *pmix_tag; +}; + +typedef struct ompi_cid_base_cid_context_t ompi_cid_base_cid_context_t; + +OBJ_CLASS_DECLARATION(ompi_cid_base_cid_context_t); + +OMPI_DECLSPEC int ompi_cid_base_cid_context_init(ompi_cid_base_cid_context_t *context, + ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, const char *pmix_tag, bool send_first, + int mode); + +OMPI_DECLSPEC int ompi_cid_base_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, bool send_first, int mode); + +/** + * Non-blocking variant of comm_activate.
+ * + * @param[inout] newcomm New communicator + * @param[in] comm Parent communicator + * @param[in] bridgecomm Bridge communicator (used for PMIX and bridge modes) + * @param[in] arg0 Mode argument 0 + * @param[in] arg1 Mode argument 1 + * @param[in] send_first Send first from this process (PMIX mode only) + * @param[in] mode Collective mode + * @param[out] req New request object to track this operation + */ +OMPI_DECLSPEC int ompi_cid_base_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, bool send_first, int mode, ompi_request_t **req); + +END_C_DECLS + +#endif /* MCA_CID_BASE_H */ diff --git a/ompi/mca/cid/base/cid_base_frame.c b/ompi/mca/cid/base/cid_base_frame.c new file mode 100644 index 00000000000..1d0fd422f10 --- /dev/null +++ b/ompi/mca/cid/base/cid_base_frame.c @@ -0,0 +1,68 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + + +#include "ompi_config.h" +#include + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNIST_H */ +#include "ompi/mca/mca.h" +#include "opal/util/output.h" +#include "opal/mca/base/base.h" + + +#include "ompi/constants.h" +#include "ompi/mca/cid/cid.h" +#include "ompi/mca/cid/base/base.h" + +/* + * The following file was created by configure. It contains extern + * statements and the definition of an array of pointers to each + * component's public mca_base_component_t struct. 
+ */ + +#include "ompi/mca/cid/base/static-components.h" + +/* + * Global variables + */ +ompi_cid_base_module_t *ompi_cid = NULL; + +static int ompi_cid_base_close(void) +{ + /* Framework close hook: give the selected module, if any, a chance to finalize. ompi_cid is initialized to NULL and is assigned by ompi_cid_base_select(), so it must be checked before it is dereferenced in case the framework is closed without a successful selection. */ + if (NULL != ompi_cid && NULL != ompi_cid->finalize) { + ompi_cid->finalize(); + } + return mca_base_framework_components_close(&ompi_cid_base_framework, NULL); +} + +/** + * Function for finding and opening either all MCA components, or the one + * that was specifically requested via a MCA parameter. + */ +static int ompi_cid_base_open(mca_base_open_flag_t flags) +{ + int rc; + + /* Open up all available components */ + rc = mca_base_framework_components_open(&ompi_cid_base_framework, flags); + + /* All done */ + return rc; +} + +MCA_BASE_FRAMEWORK_DECLARE(ompi, cid, "OMPI CID", NULL, + ompi_cid_base_open, ompi_cid_base_close, + mca_cid_base_static_components, 0); diff --git a/ompi/mca/cid/base/cid_base_select.c b/ompi/mca/cid/base/cid_base_select.c new file mode 100644 index 00000000000..36dc8583381 --- /dev/null +++ b/ompi/mca/cid/base/cid_base_select.c @@ -0,0 +1,135 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include + +#include "opal/class/opal_list.h" +#include "opal/util/output.h" +#include "opal/util/show_help.h" +#include "opal/runtime/opal_progress.h" +#include "ompi/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/runtime/opal.h" + +#include "ompi/constants.h" +#include "ompi/mca/cid/cid.h" +#include "ompi/mca/cid/base/base.h" + +/** + * Function for selecting one component from all those that are + * available.
+ */ +int ompi_cid_base_select(bool ompi_mpi_thread_multiple) +{ + ompi_cid_base_component_t *best_component = NULL; + mca_base_component_list_item_t *cli = NULL; + ompi_cid_base_component_t *component = NULL; + ompi_cid_base_module_t *module = NULL; + int priority = 0, best_priority = INT32_MIN; + int rc; + + best_component = NULL; + + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select: Auto-selecting cid components"); + + /* + * Traverse the list of available components. + * For each one, call its 'query' function to determine its relative priority. + */ + OPAL_LIST_FOREACH(cli, &ompi_cid_base_framework.framework_components, mca_base_component_list_item_t) { + component = (ompi_cid_base_component_t *) cli->cli_component; + + /* + * If there is a query function then use it. + */ + if (NULL == component->query) { + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select:(cid) Skipping component [%s]. It does not implement a query function", + component->base_version.mca_component_name ); + continue; + } + + /* + * Query this component for its module and priority. NOTE(review): 'module' is not reset to NULL between iterations, so the NULL check below relies on every query() that returns success writing it - confirm against the components. + */ + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select:(cid) Querying component [%s]", + component->base_version.mca_component_name); + + rc = component->query(&module, &priority, ompi_mpi_thread_multiple); + if (OPAL_ERR_FATAL == rc) { + /* a fatal error was detected by this component - e.g., the + * user specified a required element and the component could + * not find it.
In this case, we must not continue as we might + * find some other component that could run, causing us to do + * something the user didn't want */ + return rc; + } else if (OPAL_SUCCESS != rc) { + /* silently skip this component */ + continue; + } + + /* + * If no module was returned, then skip component + */ + if (NULL == module) { + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select:(cid) Skipping component [%s]. Query failed to return a module", + component->base_version.mca_component_name ); + continue; + } + + /* + * Determine if this is the best module we have seen by looking at its priority; the winning module is published through the global ompi_cid + */ + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select:(cid) Query of component [%s] set priority to %d", + component->base_version.mca_component_name, priority); + if (priority > best_priority) { + best_priority = priority; + best_component = component; + ompi_cid = module; + } + } + + /* + * Finished querying all components. + * Make sure we found something in the process.
+ */ + if (NULL == best_component) { + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select:(cid) No component selected!"); + /* + * Nothing was selected: still close every component before reporting the failure + */ + mca_base_components_close(0, /* Pass 0 to keep this from closing the output handle */ + &ompi_cid_base_framework.framework_components, + NULL); + return OPAL_ERR_NOT_FOUND; + } + + opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, ompi_cid_base_framework.framework_output, + "mca:base:select:(cid) Selected component [%s]", + best_component->base_version.mca_component_name); + + /* + * Close the non-selected components + */ + mca_base_components_close(ompi_cid_base_framework.framework_output, + &ompi_cid_base_framework.framework_components, + &best_component->base_version); + + + return OPAL_SUCCESS; +} diff --git a/ompi/communicator/comm_cid.c b/ompi/mca/cid/base/cid_base_stubs.c similarity index 73% rename from ompi/communicator/comm_cid.c rename to ompi/mca/cid/base/cid_base_stubs.c index 764fe42f4e7..d10be96e895 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/mca/cid/base/cid_base_stubs.c @@ -18,7 +18,7 @@ * reserved. * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2016 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
@@ -45,85 +45,7 @@ #include "ompi/mca/coll/base/base.h" #include "ompi/request/request.h" #include "ompi/runtime/mpiruntime.h" - -struct ompi_comm_cid_context_t; - -typedef int (*ompi_comm_allreduce_impl_fn_t) (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, - struct ompi_comm_cid_context_t *cid_context, - ompi_request_t **req); - - -struct ompi_comm_cid_context_t { - opal_object_t super; - - ompi_communicator_t *newcomm; - ompi_communicator_t **newcommp; - ompi_communicator_t *comm; - ompi_communicator_t *bridgecomm; - - ompi_comm_allreduce_impl_fn_t allreduce_fn; - - int nextcid; - int nextlocal_cid; - int start; - int flag, rflag; - int local_leader; - int remote_leader; - int iter; - /** storage for activate barrier */ - int ok; - char *port_string; - bool send_first; - int pml_tag; - char *pmix_tag; -}; - -typedef struct ompi_comm_cid_context_t ompi_comm_cid_context_t; - -static void mca_comm_cid_context_construct (ompi_comm_cid_context_t *context) -{ - memset ((void *) ((intptr_t) context + sizeof (context->super)), 0, sizeof (*context) - sizeof (context->super)); -} - -static void mca_comm_cid_context_destruct (ompi_comm_cid_context_t *context) -{ - free (context->port_string); - free (context->pmix_tag); -} - -OBJ_CLASS_INSTANCE (ompi_comm_cid_context_t, opal_object_t, - mca_comm_cid_context_construct, - mca_comm_cid_context_destruct); - -struct ompi_comm_allreduce_context_t { - opal_object_t super; - - int *inbuf; - int *outbuf; - int count; - struct ompi_op_t *op; - ompi_comm_cid_context_t *cid_context; - int *tmpbuf; - - /* for group allreduce */ - int peers_comm[3]; -}; - -typedef struct ompi_comm_allreduce_context_t ompi_comm_allreduce_context_t; - -static void ompi_comm_allreduce_context_construct (ompi_comm_allreduce_context_t *context) -{ - memset ((void *) ((intptr_t) context + sizeof (context->super)), 0, sizeof (*context) - sizeof (context->super)); -} - -static void ompi_comm_allreduce_context_destruct (ompi_comm_allreduce_context_t 
*context) -{ - free (context->tmpbuf); -} - -OBJ_CLASS_INSTANCE (ompi_comm_allreduce_context_t, opal_object_t, - ompi_comm_allreduce_context_construct, - ompi_comm_allreduce_context_destruct); +#include "ompi/mca/cid/base/base.h" /** * These functions make sure, that we determine the global result over @@ -134,46 +56,32 @@ OBJ_CLASS_INSTANCE (ompi_comm_allreduce_context_t, opal_object_t, /* non-blocking intracommunicator allreduce */ static int ompi_comm_allreduce_intra_nb (int *inbuf, int *outbuf, int count, - struct ompi_op_t *op, ompi_comm_cid_context_t *cid_context, + struct ompi_op_t *op, ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req); /* non-blocking intercommunicator allreduce */ static int ompi_comm_allreduce_inter_nb (int *inbuf, int *outbuf, int count, - struct ompi_op_t *op, ompi_comm_cid_context_t *cid_context, + struct ompi_op_t *op, ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req); static int ompi_comm_allreduce_group_nb (int *inbuf, int *outbuf, int count, - struct ompi_op_t *op, ompi_comm_cid_context_t *cid_context, + struct ompi_op_t *op, ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req); static int ompi_comm_allreduce_intra_pmix_nb (int *inbuf, int *outbuf, int count, - struct ompi_op_t *op, ompi_comm_cid_context_t *cid_context, + struct ompi_op_t *op, ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req); static int ompi_comm_allreduce_intra_bridge_nb (int *inbuf, int *outbuf, int count, - struct ompi_op_t *op, ompi_comm_cid_context_t *cid_context, + struct ompi_op_t *op, ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req); -static opal_mutex_t ompi_cid_lock = OPAL_MUTEX_STATIC_INIT; - - -int ompi_comm_cid_init (void) +int ompi_cid_base_cid_context_init (ompi_cid_base_cid_context_t *context, + ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, const char *pmix_tag, bool send_first, + int mode) { - 
return OMPI_SUCCESS; -} - -static ompi_comm_cid_context_t *mca_comm_cid_context_alloc (ompi_communicator_t *newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, - const void *arg1, const char *pmix_tag, bool send_first, - int mode) -{ - ompi_comm_cid_context_t *context; - - context = OBJ_NEW(ompi_comm_cid_context_t); - if (OPAL_UNLIKELY(NULL == context)) { - return NULL; - } - context->newcomm = newcomm; context->comm = comm; context->bridgecomm = bridgecomm; @@ -206,425 +114,71 @@ static ompi_comm_cid_context_t *mca_comm_cid_context_alloc (ompi_communicator_t context->remote_leader = ((int *) arg1)[0]; break; default: - OBJ_RELEASE(context); - return NULL; + return OMPI_ERROR; } context->send_first = send_first; context->iter = 0; context->ok = 1; - return context; -} - -static ompi_comm_allreduce_context_t *ompi_comm_allreduce_context_alloc (int *inbuf, int *outbuf, - int count, struct ompi_op_t *op, - ompi_comm_cid_context_t *cid_context) -{ - ompi_comm_allreduce_context_t *context; - - context = OBJ_NEW(ompi_comm_allreduce_context_t); - if (OPAL_UNLIKELY(NULL == context)) { - return NULL; - } - - context->inbuf = inbuf; - context->outbuf = outbuf; - context->count = count; - context->op = op; - context->cid_context = cid_context; - - return context; -} - -/* find the next available local cid and start an allreduce */ -static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request); -/* verify that the maximum cid is locally available and start an allreduce */ -static int ompi_comm_checkcid (ompi_comm_request_t *request); -/* verify that the cid was available globally */ -static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request); - -static volatile int64_t ompi_comm_cid_lowest_id = INT64_MAX; - -int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, - bool send_first, int mode, ompi_request_t **req) -{ - 
ompi_comm_cid_context_t *context; - ompi_comm_request_t *request; - - context = mca_comm_cid_context_alloc (newcomm, comm, bridgecomm, arg0, arg1, - "nextcid", send_first, mode); - if (NULL == context) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - context->start = ompi_mpi_communicators.lowest_free; - - request = ompi_comm_request_get (); - if (NULL == request) { - OBJ_RELEASE(context); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - request->context = &context->super; - - ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); - ompi_comm_request_start (request); - - *req = &request->super; - - return OMPI_SUCCESS; } -int ompi_comm_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, - bool send_first, int mode) -{ - ompi_request_t *req; - int rc; - - rc = ompi_comm_nextcid_nb (newcomm, comm, bridgecomm, arg0, arg1, send_first, mode, &req); - if (OMPI_SUCCESS != rc) { - return rc; - } - - ompi_request_wait_completion (req); - rc = req->req_status.MPI_ERROR; - ompi_comm_request_return ((ompi_comm_request_t *) req); - - return rc; -} - -static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request) -{ - ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context; - int64_t my_id = ((int64_t) ompi_comm_get_cid (context->comm) << 32 | context->pml_tag); - ompi_request_t *subreq; - bool flag; - int ret; - int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED); - - if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { - return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); - } - - if (ompi_comm_cid_lowest_id < my_id) { - OPAL_THREAD_UNLOCK(&ompi_cid_lock); - return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); - } - - ompi_comm_cid_lowest_id = my_id; - - /** - * This is the real algorithm described in the doc - */ - if( participate ){ - flag 
= false; - context->nextlocal_cid = mca_pml.pml_max_contextid; - for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) { - flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, - context->comm); - if (true == flag) { - context->nextlocal_cid = i; - break; - } - } - } else { - context->nextlocal_cid = 0; - } - - ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1, MPI_MAX, - context, &subreq); - /* there was a failure during non-blocking collective - * all we can do is abort - */ - if (OMPI_SUCCESS != ret) { - goto err_exit; - } - - if ( ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) ) { - /* Our local CID space is out, others already aware (allreduce above) */ - ret = OMPI_ERR_OUT_OF_RESOURCE; - goto err_exit; - } - OPAL_THREAD_UNLOCK(&ompi_cid_lock); - - /* next we want to verify that the resulting commid is ok */ - return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1); -err_exit: - if (participate && flag) { - opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL); - } - ompi_comm_cid_lowest_id = INT64_MAX; - OPAL_THREAD_UNLOCK(&ompi_cid_lock); - return ret; - -} - -static int ompi_comm_checkcid (ompi_comm_request_t *request) -{ - ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context; - ompi_request_t *subreq; - int ret; - int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED); - - if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { - return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0); - } - - if( !participate ){ - context->flag = 1; - } else { - context->flag = (context->nextcid == context->nextlocal_cid); - if ( participate && !context->flag) { - opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL); - - context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, - context->nextcid, 
context->comm); - } - } - - ++context->iter; - - ret = context->allreduce_fn (&context->flag, &context->rflag, 1, MPI_MIN, context, &subreq); - if (OMPI_SUCCESS == ret) { - ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1); - } else { - if (participate && context->flag ) { - opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL); - } - ompi_comm_cid_lowest_id = INT64_MAX; - } - - OPAL_THREAD_UNLOCK(&ompi_cid_lock); - return ret; -} - -static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request) -{ - ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context; - int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED); - - if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { - return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0); - } - - if (1 == context->rflag) { - if( !participate ) { - /* we need to provide something sane here - * but we cannot use `nextcid` as we may have it - * in-use, go ahead with next locally-available CID - */ - context->nextlocal_cid = mca_pml.pml_max_contextid; - for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) { - bool flag; - flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, - context->comm); - if (true == flag) { - context->nextlocal_cid = i; - break; - } - } - context->nextcid = context->nextlocal_cid; - } - - /* set the according values to the newcomm */ - context->newcomm->c_contextid = context->nextcid; - opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, context->newcomm); - - /* unlock the cid generator */ - ompi_comm_cid_lowest_id = INT64_MAX; - OPAL_THREAD_UNLOCK(&ompi_cid_lock); - - /* done! 
*/ - return OMPI_SUCCESS; - } - - if (participate && (1 == context->flag)) { - /* we could use this cid, but other don't agree */ - opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL); - context->start = context->nextcid + 1; /* that's where we can start the next round */ - } - - ++context->iter; - - OPAL_THREAD_UNLOCK(&ompi_cid_lock); - - /* try again */ - return ompi_comm_allreduce_getnextcid (request); -} - -/**************************************************************************/ -/**************************************************************************/ -/**************************************************************************/ -/* This routine serves two purposes: - * - the allreduce acts as a kind of Barrier, - * which avoids, that we have incoming fragments - * on the new communicator before everybody has set - * up the comm structure. - * - some components (e.g. the collective MagPIe component - * might want to generate new communicators and communicate - * using the new comm. Thus, it can just be called after - * the 'barrier'. - * - * The reason that this routine is in comm_cid and not in - * comm.c is, that this file contains the allreduce implementations - * which are required, and thus we avoid having duplicate code... 
- */ - -/* Non-blocking version of ompi_comm_activate */ -static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request); - -int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, - const void *arg1, bool send_first, int mode, ompi_request_t **req) -{ - ompi_comm_cid_context_t *context; - ompi_comm_request_t *request; - ompi_request_t *subreq; - int ret = 0; - - context = mca_comm_cid_context_alloc (*newcomm, comm, bridgecomm, arg0, arg1, "activate", - send_first, mode); - if (NULL == context) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - - /* keep track of the pointer so it can be set to MPI_COMM_NULL on failure */ - context->newcommp = newcomm; - - request = ompi_comm_request_get (); - if (NULL == request) { - OBJ_RELEASE(context); - return OMPI_ERR_OUT_OF_RESOURCE; - } - - request->context = &context->super; - - if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) { - /* Initialize the PML stuff in the newcomm */ - if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) { - OBJ_RELEASE(*newcomm); - OBJ_RELEASE(context); - *newcomm = MPI_COMM_NULL; - return ret; - } - OMPI_COMM_SET_PML_ADDED(*newcomm); - } - - /* Step 1: the barrier, after which it is allowed to - * send messages over the new communicator - */ - ret = context->allreduce_fn (&context->ok, &context->ok, 1, MPI_MIN, context, - &subreq); - if (OMPI_SUCCESS != ret) { - ompi_comm_request_return (request); - return ret; - } - - ompi_comm_request_schedule_append (request, ompi_comm_activate_nb_complete, &subreq, 1); - ompi_comm_request_start (request); - - *req = &request->super; - - return OMPI_SUCCESS; -} +struct ompi_comm_allreduce_context_t { + opal_object_t super; -int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm, - ompi_communicator_t *bridgecomm, const void *arg0, - const void *arg1, bool send_first, int mode) -{ - ompi_request_t *req; - int rc; + int *inbuf; + 
int *outbuf; + int count; + struct ompi_op_t *op; + ompi_cid_base_cid_context_t *cid_context; + int *tmpbuf; - rc = ompi_comm_activate_nb (newcomm, comm, bridgecomm, arg0, arg1, send_first, mode, &req); - if (OMPI_SUCCESS != rc) { - return rc; - } + /* for group allreduce */ + int peers_comm[3]; +}; - ompi_request_wait_completion (req); - rc = req->req_status.MPI_ERROR; - ompi_comm_request_return ((ompi_comm_request_t *) req); +typedef struct ompi_comm_allreduce_context_t ompi_comm_allreduce_context_t; - return rc; +static void ompi_comm_allreduce_context_construct (ompi_comm_allreduce_context_t *context) +{ + memset ((void *) ((intptr_t) context + sizeof (context->super)), 0, sizeof (*context) - sizeof (context->super)); } -static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request) +static void ompi_comm_allreduce_context_destruct (ompi_comm_allreduce_context_t *context) { - ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context; - int ret; + free (context->tmpbuf); +} - /** - * Check to see if this process is in the new communicator. - * - * Specifically, this function is invoked by all proceses in the - * old communicator, regardless of whether they are in the new - * communicator or not. This is because it is far simpler to use - * MPI collective functions on the old communicator to determine - * some data for the new communicator (e.g., remote_leader) than - * to kludge up our own pseudo-collective routines over just the - * processes in the new communicator. Hence, *all* processes in - * the old communicator need to invoke this function. - * - * That being said, only processes in the new communicator need to - * select a coll module for the new communicator. More - * specifically, proceses who are not in the new communicator - * should *not* select a coll module -- for example, - * ompi_comm_rank(newcomm) returns MPI_UNDEFINED for processes who - * are not in the new communicator. 
This can cause errors in the - * selection / initialization of a coll module. Plus, it's - * wasteful -- processes in the new communicator will end up - * freeing the new communicator anyway, so we might as well leave - * the coll selection as NULL (the coll base comm unselect code - * handles that case properly). - */ - if (MPI_UNDEFINED == (context->newcomm)->c_local_group->grp_my_rank) { - return OMPI_SUCCESS; - } +static OBJ_CLASS_INSTANCE (ompi_comm_allreduce_context_t, opal_object_t, + ompi_comm_allreduce_context_construct, + ompi_comm_allreduce_context_destruct); - /* Let the collectives components fight over who will do - collective on this new comm. */ - if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(context->newcomm))) { - OBJ_RELEASE(context->newcomm); - *context->newcommp = MPI_COMM_NULL; - return ret; - } +static ompi_comm_allreduce_context_t *ompi_comm_allreduce_context_alloc (int *inbuf, int *outbuf, + int count, struct ompi_op_t *op, + ompi_cid_base_cid_context_t *cid_context) +{ + ompi_comm_allreduce_context_t *context; - /* For an inter communicator, we have to deal with the potential - * problem of what is happening if the local_comm that we created - * has a lower CID than the parent comm. This is not a problem - * as long as the user calls MPI_Comm_free on the inter communicator. - * However, if the communicators are not freed by the user but released - * by Open MPI in MPI_Finalize, we walk through the list of still available - * communicators and free them one by one. Thus, local_comm is freed before - * the actual inter-communicator. However, the local_comm pointer in the - * inter communicator will still contain the 'previous' address of the local_comm - * and thus this will lead to a segmentation violation. In order to prevent - * that from happening, we increase the reference counter local_comm - * by one if its CID is lower than the parent. 
We cannot increase however - * its reference counter if the CID of local_comm is larger than - * the CID of the inter communicators, since a regular MPI_Comm_free would - * leave in that the case the local_comm hanging around and thus we would not - * recycle CID's properly, which was the reason and the cause for this trouble. - */ - if (OMPI_COMM_IS_INTER(context->newcomm)) { - if (OMPI_COMM_CID_IS_LOWER(context->newcomm, context->comm)) { - OMPI_COMM_SET_EXTRA_RETAIN (context->newcomm); - OBJ_RETAIN (context->newcomm); - } + context = OBJ_NEW(ompi_comm_allreduce_context_t); + if (OPAL_UNLIKELY(NULL == context)) { + return NULL; } - /* done */ - return OMPI_SUCCESS; + context->inbuf = inbuf; + context->outbuf = outbuf; + context->count = count; + context->op = op; + context->cid_context = cid_context; + + return context; } /**************************************************************************/ /**************************************************************************/ /**************************************************************************/ static int ompi_comm_allreduce_intra_nb (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, - ompi_comm_cid_context_t *context, ompi_request_t **req) + ompi_cid_base_cid_context_t *context, ompi_request_t **req) { ompi_communicator_t *comm = context->comm; @@ -639,7 +193,7 @@ static int ompi_comm_allreduce_inter_bcast (ompi_comm_request_t *request); static int ompi_comm_allreduce_inter_nb (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, - ompi_comm_cid_context_t *cid_context, + ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req) { ompi_communicator_t *intercomm = cid_context->comm; @@ -804,7 +358,7 @@ static int ompi_comm_allreduce_bridged_reduce_complete (ompi_comm_request_t *req static int ompi_comm_allreduce_intra_bridge_nb (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, - ompi_comm_cid_context_t *cid_context, + ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req) { 
ompi_communicator_t *comm = cid_context->comm; @@ -873,7 +427,7 @@ static int ompi_comm_allreduce_intra_bridge_nb (int *inbuf, int *outbuf, static int ompi_comm_allreduce_pmix_reduce_complete (ompi_comm_request_t *request) { ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context; - ompi_comm_cid_context_t *cid_context = context->cid_context; + ompi_cid_base_cid_context_t *cid_context = context->cid_context; int32_t size_count = context->count; opal_value_t info; opal_pmix_pdata_t pdat; @@ -960,7 +514,7 @@ static int ompi_comm_allreduce_pmix_reduce_complete (ompi_comm_request_t *reques static int ompi_comm_allreduce_intra_pmix_nb (int *inbuf, int *outbuf, int count, struct ompi_op_t *op, - ompi_comm_cid_context_t *cid_context, + ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req) { ompi_communicator_t *comm = cid_context->comm; @@ -1025,7 +579,7 @@ static int ompi_comm_allreduce_intra_pmix_nb (int *inbuf, int *outbuf, static int ompi_comm_allreduce_group_broadcast (ompi_comm_request_t *request) { ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context; - ompi_comm_cid_context_t *cid_context = context->cid_context; + ompi_cid_base_cid_context_t *cid_context = context->cid_context; ompi_request_t *subreq[2]; int subreq_count = 0; int rc; @@ -1047,7 +601,7 @@ static int ompi_comm_allreduce_group_broadcast (ompi_comm_request_t *request) static int ompi_comm_allreduce_group_recv_complete (ompi_comm_request_t *request) { ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context; - ompi_comm_cid_context_t *cid_context = context->cid_context; + ompi_cid_base_cid_context_t *cid_context = context->cid_context; int *tmp = context->tmpbuf; ompi_request_t *subreq[2]; int rc; @@ -1082,7 +636,7 @@ static int ompi_comm_allreduce_group_recv_complete (ompi_comm_request_t *request } static int ompi_comm_allreduce_group_nb (int *inbuf, int *outbuf, int count, - struct 
ompi_op_t *op, ompi_comm_cid_context_t *cid_context, + struct ompi_op_t *op, ompi_cid_base_cid_context_t *cid_context, ompi_request_t **req) { ompi_group_t *group = cid_context->newcomm->c_local_group; @@ -1144,3 +698,187 @@ static int ompi_comm_allreduce_group_nb (int *inbuf, int *outbuf, int count, return OMPI_SUCCESS; } +/**************************************************************************/ +/**************************************************************************/ +/**************************************************************************/ +/* This routine serves two purposes: + * - the allreduce acts as a kind of barrier, + * which prevents incoming fragments from arriving + * on the new communicator before everybody has set + * up the comm structure. + * - some components (e.g. the collective MagPIe component) + * might want to generate new communicators and communicate + * using the new comm. Thus, it can just be called after + * the 'barrier'. + * + * The reason that this routine is in cid_base_stubs.c and not in + * comm.c is that this file contains the allreduce implementations + * which are required, and thus we avoid having duplicate code... 
+ */ + +/* Non-blocking version of ompi_cid_base_comm_activate */ +static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request); + +int ompi_cid_base_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, bool send_first, int mode, ompi_request_t **req) +{ + ompi_cid_base_cid_context_t *context; + ompi_comm_request_t *request; + ompi_request_t *subreq; + int ret = 0; + + context = OBJ_NEW(ompi_cid_base_cid_context_t); + if (NULL == context) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + ret = ompi_cid_base_cid_context_init (context, *newcomm, comm, bridgecomm, arg0, arg1, "activate", + send_first, mode); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OBJ_RELEASE(context); + return ret; + } + + /* keep track of the pointer so it can be set to MPI_COMM_NULL on failure */ + context->newcommp = newcomm; + + request = ompi_comm_request_get (); + if (NULL == request) { + OBJ_RELEASE(context); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + request->context = &context->super; + + if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) { + /* Initialize the PML stuff in the newcomm */ + if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) { + OBJ_RELEASE(*newcomm); + OBJ_RELEASE(context); + *newcomm = MPI_COMM_NULL; + return ret; + } + OMPI_COMM_SET_PML_ADDED(*newcomm); + } + + /* Step 1: the barrier, after which it is allowed to + * send messages over the new communicator + */ + ret = context->allreduce_fn (&context->ok, &context->ok, 1, MPI_MIN, context, + &subreq); + if (OMPI_SUCCESS != ret) { + ompi_comm_request_return (request); + return ret; + } + + ompi_comm_request_schedule_append (request, ompi_comm_activate_nb_complete, &subreq, 1); + ompi_comm_request_start (request); + + *req = &request->super; + + return OMPI_SUCCESS; +} + +int ompi_cid_base_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, 
const void 
*arg0, + const void *arg1, bool send_first, int mode) +{ + ompi_request_t *req; + int rc; + + rc = ompi_cid_base_comm_activate_nb (newcomm, comm, bridgecomm, arg0, arg1, send_first, mode, &req); + if (OMPI_SUCCESS != rc) { + return rc; + } + + ompi_request_wait_completion (req); + rc = req->req_status.MPI_ERROR; + ompi_comm_request_return ((ompi_comm_request_t *) req); + + return rc; +} + +static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request) +{ + ompi_cid_base_cid_context_t *context = (ompi_cid_base_cid_context_t *) request->context; + int ret; + + /** + * Check to see if this process is in the new communicator. + * + * Specifically, this function is invoked by all processes in the + * old communicator, regardless of whether they are in the new + * communicator or not. This is because it is far simpler to use + * MPI collective functions on the old communicator to determine + * some data for the new communicator (e.g., remote_leader) than + * to kludge up our own pseudo-collective routines over just the + * processes in the new communicator. Hence, *all* processes in + * the old communicator need to invoke this function. + * + * That being said, only processes in the new communicator need to + * select a coll module for the new communicator. More + * specifically, processes who are not in the new communicator + * should *not* select a coll module -- for example, + * ompi_comm_rank(newcomm) returns MPI_UNDEFINED for processes who + * are not in the new communicator. This can cause errors in the + * selection / initialization of a coll module. Plus, it's + * wasteful -- processes not in the new communicator will end up + * freeing the new communicator anyway, so we might as well leave + * the coll selection as NULL (the coll base comm unselect code + * handles that case properly). 
+ */ + if (MPI_UNDEFINED == (context->newcomm)->c_local_group->grp_my_rank) { + return OMPI_SUCCESS; + } + + /* Let the collectives components fight over who will do + collective on this new comm. */ + if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(context->newcomm))) { + OBJ_RELEASE(context->newcomm); + *context->newcommp = MPI_COMM_NULL; + return ret; + } + + /* For an inter communicator, we have to deal with the potential + * problem of what is happening if the local_comm that we created + * has a lower CID than the parent comm. This is not a problem + * as long as the user calls MPI_Comm_free on the inter communicator. + * However, if the communicators are not freed by the user but released + * by Open MPI in MPI_Finalize, we walk through the list of still available + * communicators and free them one by one. Thus, local_comm is freed before + * the actual inter-communicator. However, the local_comm pointer in the + * inter communicator will still contain the 'previous' address of the local_comm + * and thus this will lead to a segmentation violation. In order to prevent + * that from happening, we increase the reference counter local_comm + * by one if its CID is lower than the parent. We cannot increase however + * its reference counter if the CID of local_comm is larger than + * the CID of the inter communicators, since a regular MPI_Comm_free would + * leave in that the case the local_comm hanging around and thus we would not + * recycle CID's properly, which was the reason and the cause for this trouble. 
+ */ + if (OMPI_COMM_IS_INTER(context->newcomm)) { + if (OMPI_COMM_CID_IS_LOWER(context->newcomm, context->comm)) { + OMPI_COMM_SET_EXTRA_RETAIN (context->newcomm); + OBJ_RETAIN (context->newcomm); + } + } + + /* done */ + return OMPI_SUCCESS; +} + + +static void mca_comm_cid_context_construct (ompi_cid_base_cid_context_t *context) +{ + memset ((void *) ((intptr_t) context + sizeof (context->super)), 0, sizeof (*context) - sizeof (context->super)); +} + +static void mca_comm_cid_context_destruct (ompi_cid_base_cid_context_t *context) +{ + free (context->port_string); + free (context->pmix_tag); +} + +OBJ_CLASS_INSTANCE (ompi_cid_base_cid_context_t, opal_object_t, + mca_comm_cid_context_construct, + mca_comm_cid_context_destruct); diff --git a/ompi/mca/cid/basic/Makefile.am b/ompi/mca/cid/basic/Makefile.am new file mode 100644 index 00000000000..13c33fa94a9 --- /dev/null +++ b/ompi/mca/cid/basic/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + cid_basic_component.c \ + cid_basic.h \ + cid_basic.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). 
+ +if MCA_BUILD_ompi_cid_basic_DSO +component_noinst = +component_install = mca_cid_basic.la +else +component_noinst = libmca_cid_basic.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_cid_basic_la_SOURCES = $(sources) +mca_cid_basic_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_cid_basic_la_SOURCES = $(sources) +libmca_cid_basic_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/cid/basic/cid_basic.c b/ompi/mca/cid/basic/cid_basic.c new file mode 100644 index 00000000000..09c8a0565c0 --- /dev/null +++ b/ompi/mca/cid/basic/cid_basic.c @@ -0,0 +1,316 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2017 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Voltaire All rights reserved. + * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2016 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/dss/dss.h" +#include "opal/mca/pmix/pmix.h" + +#include "ompi/proc/proc.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" +#include "ompi/constants.h" +#include "opal/class/opal_pointer_array.h" +#include "opal/class/opal_list.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/rte/rte.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/request/request.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/mca/cid/base/base.h" + +#include "cid_basic.h" + +static int cid_basic_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode, ompi_request_t **req); + +static int cid_basic_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode); + +static int cid_basic_release_nb (int cid, ompi_request_t **req); + +static int cid_basic_release (int cid); + +static int cid_basic_init (void); + +ompi_cid_base_module_t ompi_cid_basic_module = { + .nextcid_nb = cid_basic_nextcid_nb, + .nextcid = cid_basic_nextcid, + .activate_nb= ompi_cid_base_comm_activate_nb, + .activate= ompi_cid_base_comm_activate, + .release_nb = cid_basic_release_nb, + .release = cid_basic_release, + .init = cid_basic_init +}; + +static opal_mutex_t ompi_cid_lock = OPAL_MUTEX_STATIC_INIT; + +static int cid_basic_init (void) +{ + return OMPI_SUCCESS; +} + +static int cid_basic_release_nb(int cid, ompi_request_t **req) { + *req = &ompi_request_empty; + return OMPI_SUCCESS; +} + +static int cid_basic_release(int cid) { + return OMPI_SUCCESS; +} + +/* find the next available local cid and start an allreduce */ +static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request); +/* verify that the maximum cid is locally available and 
start an allreduce */ +static int ompi_comm_checkcid (ompi_comm_request_t *request); +/* verify that the cid was available globally */ +static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request); + +static volatile int64_t ompi_comm_cid_lowest_id = INT64_MAX; + +static int cid_basic_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode, ompi_request_t **req) +{ + ompi_cid_base_cid_context_t *context; + ompi_comm_request_t *request; + int ret; + + context = OBJ_NEW(ompi_cid_base_cid_context_t); + if (OPAL_UNLIKELY(NULL == context)) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + ret = ompi_cid_base_cid_context_init (context, newcomm, comm, bridgecomm, arg0, arg1, + "nextcid", send_first, mode); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OBJ_RELEASE(context); + return ret; + } + + context->start = ompi_mpi_communicators.lowest_free; + + request = ompi_comm_request_get (); + if (NULL == request) { + OBJ_RELEASE(context); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + request->context = &context->super; + + ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + ompi_comm_request_start (request); + + *req = &request->super; + + + return OMPI_SUCCESS; +} + +int cid_basic_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode) +{ + ompi_request_t *req; + int rc; + + rc = cid_basic_nextcid_nb (newcomm, comm, bridgecomm, arg0, arg1, send_first, mode, &req); + if (OMPI_SUCCESS != rc) { + return rc; + } + + ompi_request_wait_completion (req); + rc = req->req_status.MPI_ERROR; + ompi_comm_request_return ((ompi_comm_request_t *) req); + + return rc; +} + +static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request) +{ + ompi_cid_base_cid_context_t *context = (ompi_cid_base_cid_context_t *) request->context; 
+ int64_t my_id = ((int64_t) ompi_comm_get_cid (context->comm) << 32 | context->pml_tag); + ompi_request_t *subreq; + bool flag; + int ret; + int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED); + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + } + + if (ompi_comm_cid_lowest_id < my_id) { + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + } + + ompi_comm_cid_lowest_id = my_id; + + /** + * This is the real algorithm described in the doc + */ + if( participate ){ + flag = false; + context->nextlocal_cid = mca_pml.pml_max_contextid; + for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) { + flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, + context->comm); + if (true == flag) { + context->nextlocal_cid = i; + break; + } + } + } else { + context->nextlocal_cid = 0; + } + + ret = context->allreduce_fn (&context->nextlocal_cid, &context->nextcid, 1, MPI_MAX, + context, &subreq); + /* there was a failure during non-blocking collective + * all we can do is abort + */ + if (OMPI_SUCCESS != ret) { + goto err_exit; + } + + if ( ((unsigned int) context->nextlocal_cid == mca_pml.pml_max_contextid) ) { + /* Our local CID space is out, others already aware (allreduce above) */ + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto err_exit; + } + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + /* next we want to verify that the resulting commid is ok */ + return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1); +err_exit: + if (participate && flag) { + opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL); + } + ompi_comm_cid_lowest_id = INT64_MAX; + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + return ret; + +} + +static int ompi_comm_checkcid (ompi_comm_request_t *request) +{ + 
ompi_cid_base_cid_context_t *context = (ompi_cid_base_cid_context_t *) request->context; + ompi_request_t *subreq; + int ret; + int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED); + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0); + } + + if( !participate ){ + context->flag = 1; + } else { + context->flag = (context->nextcid == context->nextlocal_cid); + if ( participate && !context->flag) { + opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL); + + context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, + context->nextcid, context->comm); + } + } + + ++context->iter; + + ret = context->allreduce_fn (&context->flag, &context->rflag, 1, MPI_MIN, context, &subreq); + if (OMPI_SUCCESS == ret) { + ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1); + } else { + if (participate && context->flag ) { + opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL); + } + ompi_comm_cid_lowest_id = INT64_MAX; + } + + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + return ret; +} + +static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request) +{ + ompi_cid_base_cid_context_t *context = (ompi_cid_base_cid_context_t *) request->context; + int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED); + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0); + } + + if (1 == context->rflag) { + if( !participate ) { + /* we need to provide something sane here + * but we cannot use `nextcid` as we may have it + * in-use, go ahead with next locally-available CID + */ + context->nextlocal_cid = mca_pml.pml_max_contextid; + for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) { + bool flag; + flag = opal_pointer_array_test_and_set_item 
(&ompi_mpi_communicators, i, + context->comm); + if (true == flag) { + context->nextlocal_cid = i; + break; + } + } + context->nextcid = context->nextlocal_cid; + } + + /* set the according values to the newcomm */ + context->newcomm->c_contextid = context->nextcid; + opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, context->newcomm); + + /* unlock the cid generator */ + ompi_comm_cid_lowest_id = INT64_MAX; + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + /* done! */ + return OMPI_SUCCESS; + } + + if (participate && (1 == context->flag)) { + /* we could use this cid, but other don't agree */ + opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL); + context->start = context->nextcid + 1; /* that's where we can start the next round */ + } + + ++context->iter; + + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + /* try again */ + return ompi_comm_allreduce_getnextcid (request); +} + diff --git a/ompi/mca/cid/basic/cid_basic.h b/ompi/mca/cid/basic/cid_basic.h new file mode 100644 index 00000000000..79e08f48ec0 --- /dev/null +++ b/ompi/mca/cid/basic/cid_basic.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef _MCA_CID_BASIC_H_ +#define _MCA_CID_BASIC_H_ + +#include "ompi_config.h" + +#include "opal/mca/base/base.h" +#include "ompi/mca/cid/cid.h" + + +BEGIN_C_DECLS + +ORTE_MODULE_DECLSPEC extern ompi_cid_base_component_t mca_cid_basic_component; +extern ompi_cid_base_module_t ompi_cid_basic_module; + +END_C_DECLS + +#endif /* MCA_CID_BASIC_H_ */ diff --git a/ompi/mca/cid/basic/cid_basic_component.c b/ompi/mca/cid/basic/cid_basic_component.c new file mode 100644 index 00000000000..b3f34094258 --- /dev/null +++ b/ompi/mca/cid/basic/cid_basic_component.c @@ -0,0 +1,46 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/types.h" +#include "opal/types.h" + +#include "opal/util/show_help.h" + +#include "ompi/mca/cid/cid.h" + +#include "cid_basic.h" + +static int ompi_cid_basic_query(ompi_cid_base_module_t **module, int *priority, bool ompi_mpi_thread_multiple); + +/* + * Struct of function pointers and all that to let us be initialized + */ +ompi_cid_base_component_t mca_cid_basic_component = { + .base_version = { + OMPI_CID_BASE_VERSION_1_0_0, + .mca_component_name = "basic", + MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION), + }, + .base_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + .query = ompi_cid_basic_query, +}; + +static int ompi_cid_basic_query(ompi_cid_base_module_t **module, int *priority, bool ompi_mpi_thread_multiple) +{ + *module = &ompi_cid_basic_module; + *priority = 10; + return ORTE_SUCCESS; +} diff --git a/ompi/mca/cid/basic/owner.txt b/ompi/mca/cid/basic/owner.txt new file mode 100644 index 00000000000..e6150b6b0fc --- 
/dev/null +++ b/ompi/mca/cid/basic/owner.txt @@ -0,0 +1,7 @@ +# +# owner/status file +# owner: institution that is responsible for this package +# status: e.g. active, maintenance, unmaintained +# +owner: project +status: active diff --git a/ompi/mca/cid/cid.h b/ompi/mca/cid/cid.h new file mode 100644 index 00000000000..641eb3e79e3 --- /dev/null +++ b/ompi/mca/cid/cid.h @@ -0,0 +1,171 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2017 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Voltaire All rights reserved. + * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2016 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_CID_H +#define MCA_CID_H + +#include "ompi_config.h" + +#include "opal/class/opal_object.h" +#include "ompi/mca/mca.h" + +#include "ompi/types.h" +#include "ompi/communicator/communicator.h" +#include "ompi/request/request.h" + +BEGIN_C_DECLS + +/** + * Pre-declare this so that we can pass it as an argument to the + * typedef'ed functions. + */ +struct ompi_cid_base_module_1_0_0_t; + +typedef struct ompi_cid_base_module_1_0_0_t ompi_cid_base_module_t; + +/** + * allocate new communicator ID (non-blocking) + * @param newcomm: pointer to the new communicator + * @param oldcomm: original comm + * @param bridgecomm: bridge comm for intercomm_create + * @param mode: combination of input + * OMPI_COMM_CID_INTRA: intra-comm + * OMPI_COMM_CID_INTER: inter-comm + * This routine has to be thread safe in the final version. + */ +typedef int (*ompi_cid_base_module_nextcid_nb_fn_t) (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode, ompi_request_t **req); + +/** + * allocate new communicator ID + * @param newcomm: pointer to the new communicator + * @param oldcomm: original comm + * @param bridgecomm: bridge comm for intercomm_create + * @param mode: combination of input + * OMPI_COMM_CID_INTRA: intra-comm + * OMPI_COMM_CID_INTER: inter-comm + * OMPI_COMM_CID_GROUP: only decide CID within the ompi_group_t + * associated with the communicator. arg0 + * must point to an int which will be used + * as the pml tag for communication. + * OMPI_COMM_CID_INTRA_BRIDGE: 2 intracomms connected by + * a bridge comm. arg0 and arg1 must point + * to integers representing the local and + * remote leader ranks. the remote leader rank + * is a rank in the bridgecomm. + * OMPI_COMM_CID_INTRA_PMIX: 2 intracomms, leaders talk + * through PMIx. 
arg0 must point to an integer + * representing the local leader rank. arg1 + * must point to a string representing the + * port of the remote leader. + * @param send_first: to avoid a potential deadlock for + * the OOB version. + * This routine has to be thread safe in the final version. + */ +typedef int (*ompi_cid_base_module_nextcid_fn_t) (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode); + +/* This routine serves two purposes: + * - the allreduce acts as a kind of Barrier, + * which avoids, that we have incoming fragments + * on the new communicator before everybody has set + * up the comm structure. + * - some components (e.g. the collective MagPIe component + * might want to generate new communicators and communicate + * using the new comm. Thus, it can just be called after + * the 'barrier'. + * + * The reason that this routine is in comm_cid and not in + * comm.c is, that this file contains the allreduce implementations + * which are required, and thus we avoid having duplicate code... 
+ */ +typedef int (*ompi_cid_base_module_activate_nb_fn_t) (ompi_communicator_t **newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, bool send_first, int mode, ompi_request_t **req); + +typedef int (*ompi_cid_base_module_activate_fn_t) (ompi_communicator_t **newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, + const void *arg1, bool send_first, int mode); + +typedef int (*ompi_cid_base_module_release_nb_fn_t) (int cid, ompi_request_t **req); + +typedef int (*ompi_cid_base_module_release_fn_t) (int cid); + +/* initialize the module - allow it to do whatever one-time + * things it requires */ +typedef int (*ompi_cid_base_module_init_fn_t)(void); + +/* give the component a chance to clean up */ +typedef void (*ompi_cid_base_module_finalize_fn_t)(void); + +/** + * Module struct + */ +typedef struct ompi_cid_base_module_1_0_0_t { + ompi_cid_base_module_nextcid_nb_fn_t nextcid_nb; + ompi_cid_base_module_nextcid_fn_t nextcid; + ompi_cid_base_module_activate_nb_fn_t activate_nb; + ompi_cid_base_module_activate_fn_t activate; + ompi_cid_base_module_release_nb_fn_t release_nb; + ompi_cid_base_module_release_fn_t release; + ompi_cid_base_module_init_fn_t init; + ompi_cid_base_module_finalize_fn_t finalize; +} ompi_cid_base_module_1_0_0_t; + +OMPI_DECLSPEC extern ompi_cid_base_module_t *ompi_cid; + +typedef int (*ompi_cid_base_component_query_t)(ompi_cid_base_module_t ** module, int *priority, bool ompi_mpi_thread_multiple); + +typedef struct ompi_cid_base_component_1_0_0_t { + mca_base_component_t base_version; + mca_base_component_data_t base_data; + ompi_cid_base_component_query_t query; +} ompi_cid_base_component_1_0_0_t; + + +/** Per guidance in mca.h, use the unversioned struct name if you just + want to always keep up with the most recent version of the + interface.
*/ +typedef struct ompi_cid_base_component_1_0_0_t ompi_cid_base_component_t; + +/** + * Struct that is used in op.h to hold all the function pointers and + * pointers to the corresponding modules (so that we can properly + * RETAIN/RELEASE them) + */ +#define OMPI_CID_BASE_VERSION_1_0_0 \ + OMPI_MCA_BASE_VERSION_2_1_0("cid", 1, 0, 0) + +END_C_DECLS + +#endif /* MCA_CID_H */ diff --git a/ompi/mca/cid/pmix/Makefile.am b/ompi/mca/cid/pmix/Makefile.am new file mode 100644 index 00000000000..7151d2b0b6c --- /dev/null +++ b/ompi/mca/cid/pmix/Makefile.am @@ -0,0 +1,35 @@ +# +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +sources = \ + cid_pmix_component.c \ + cid_pmix.h \ + cid_pmix.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). + +if MCA_BUILD_ompi_cid_pmix_DSO +component_noinst = +component_install = mca_cid_pmix.la +else +component_noinst = libmca_cid_pmix.la +component_install = +endif + +mcacomponentdir = $(ompilibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_cid_pmix_la_SOURCES = $(sources) +mca_cid_pmix_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_cid_pmix_la_SOURCES = $(sources) +libmca_cid_pmix_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/cid/pmix/cid_pmix.c b/ompi/mca/cid/pmix/cid_pmix.c new file mode 100644 index 00000000000..08c7ded4efa --- /dev/null +++ b/ompi/mca/cid/pmix/cid_pmix.c @@ -0,0 +1,546 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2017 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved.
+ * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Voltaire All rights reserved. + * Copyright (c) 2006-2010 University of Houston. All rights reserved. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2016 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "opal/dss/dss.h" +#include "opal/mca/pmix/pmix.h" + +#include "orte/runtime/orte_wait.h" + +#include "ompi/proc/proc.h" +#include "ompi/communicator/communicator.h" +#include "ompi/op/op.h" +#include "ompi/constants.h" +#include "opal/class/opal_pointer_array.h" +#include "opal/class/opal_list.h" +#include "ompi/mca/pml/pml.h" +#include "ompi/mca/rte/rte.h" +#include "ompi/mca/coll/base/base.h" +#include "ompi/request/request.h" +#include "ompi/runtime/mpiruntime.h" +#include "ompi/mca/cid/base/base.h" + +#include "cid_pmix.h" + +static int cid_pmix_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode, ompi_request_t **req); + +static int cid_pmix_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int 
mode); + +static int cid_pmix_release_nb (int cid, ompi_request_t **req); + +static int cid_pmix_release (int cid); + +static int cid_pmix_init (void); + +ompi_cid_base_module_t ompi_cid_pmix_module = { + .nextcid_nb = cid_pmix_nextcid_nb, + .nextcid = cid_pmix_nextcid, + .activate_nb= ompi_cid_base_comm_activate_nb, + .activate= ompi_cid_base_comm_activate, + .release_nb = cid_pmix_release_nb, + .release = cid_pmix_release, + .init = cid_pmix_init +}; + +static opal_mutex_t ompi_cid_lock = OPAL_MUTEX_STATIC_INIT; + +struct participant_t { + opal_namelist_t super; + int rank; + int participate; +}; + +typedef struct participant_t participant_t; + +static void participant_construct(participant_t * participant) { + participant->rank = MPI_UNDEFINED; + participant->participate = false; +} + +static OBJ_CLASS_INSTANCE(participant_t, opal_namelist_t, participant_construct, NULL); + +struct pmix_context_t { + ompi_cid_base_cid_context_t super; + int participate; + int master; + opal_list_t localprocs; + int children; +}; + +typedef struct pmix_context_t pmix_context_t; + +static void pmix_context_construct (pmix_context_t *context) +{ + OBJ_CONSTRUCT(&context->localprocs, opal_list_t); + context->participate = true; + context->master = MPI_UNDEFINED; + context->children = 0; +} + +static void pmix_context_destruct (pmix_context_t *context) +{ + OPAL_LIST_DESTRUCT(&context->localprocs); +} + +static OBJ_CLASS_INSTANCE (pmix_context_t, ompi_cid_base_cid_context_t, + pmix_context_construct, + pmix_context_destruct); + +int cid_pmix_init (void) +{ + return OMPI_SUCCESS; +} + +/* find the next available local cid and start an allreduce */ +static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request); +static int ompi_comm_allreduce_getlocalprocs (ompi_comm_request_t *request); +static int ompi_comm_allreduce_nextlocal_cid (ompi_comm_request_t *request); +/* verify that the maximum cid is locally available and start an allreduce */ +static int ompi_comm_checkcid 
(ompi_comm_request_t *request); +static int ompi_comm_checkcid2 (ompi_comm_request_t *request); +/* verify that the cid was available globally */ +static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request); +static int ompi_comm_nextcid_setcid (ompi_comm_request_t *request); + +static volatile int64_t ompi_comm_cid_lowest_id = INT64_MAX; + +static int cid_pmix_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode, ompi_request_t **req) +{ + pmix_context_t *context; + ompi_comm_request_t *request; + ompi_request_t **reqs = NULL; + int ret; + + context = OBJ_NEW(pmix_context_t); + if (NULL == context) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ret = ompi_cid_base_cid_context_init (&context->super, newcomm, comm, bridgecomm, arg0, arg1, + "nextcid", send_first, mode); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OBJ_RELEASE(context); + return ret; + } + + context->super.start = 3; + context->super.nextlocal_cid = 0; + context->participate = (context->super.newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED)?1:0; + + request = ompi_comm_request_get (); + if (NULL == request) { + OBJ_RELEASE(context); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + request->context = &context->super.super; + context->master = -1; + + + if (OMPI_COMM_IS_INTER(context->super.comm)) { + context->children = 0; + participant_t *p = OBJ_NEW(participant_t); + p->super.name = OPAL_PROC_MY_NAME; + p->rank = ompi_comm_rank(context->super.comm); + p->participate = context->participate; + opal_list_append(&context->localprocs, &p->super.super); + ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + } else { + ompi_group_t *group = context->super.comm->c_local_group; + /* am I the master (e.g.) 
task with the lowest rank on this node */ + for (int i=0; i<ompi_comm_rank(context->super.comm); i++) { + if (!ompi_proc_is_sentinel(group->grp_proc_pointers[i]) && + group->grp_proc_pointers[i]->super.proc_flags & OPAL_PROC_ON_NODE) { + context->master = i; + break; + } + } + if (0 > context->master) { + context->children = 0; + for (int i=ompi_comm_rank(context->super.comm)+1; i<group->grp_proc_count; i++) { + if (!ompi_proc_is_sentinel(group->grp_proc_pointers[i]) && + group->grp_proc_pointers[i]->super.proc_flags & OPAL_PROC_ON_NODE) { + context->children++; + } + } + if (context->children > 0) { + int j=0; + reqs = (ompi_request_t **)alloca(context->children * sizeof(ompi_request_t *)); + for (int i=ompi_comm_rank(context->super.comm)+1; i<group->grp_proc_count; i++) { + if (!ompi_proc_is_sentinel(group->grp_proc_pointers[i]) && + group->grp_proc_pointers[i]->super.proc_flags & OPAL_PROC_ON_NODE) { + participant_t *p = OBJ_NEW(participant_t); + p->super.name = group->grp_proc_pointers[i]->super.proc_name; + p->rank = i; + MCA_PML_CALL(irecv(&p->participate, 1, MPI_INT, i, OMPI_COMM_LOCAL_TAG, context->super.comm, reqs+j)); + opal_list_append(&context->localprocs, &p->super.super); + j++; + } + } + assert (j == context->children); + } + ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getlocalprocs, reqs, context->children); + } else { + /* I am not the master, tell the master whether I participate or not */ + reqs = (ompi_request_t **)alloca(sizeof(ompi_request_t *)); + MCA_PML_CALL(isend(&context->participate, 1, MPI_INT, context->master, OMPI_COMM_LOCAL_TAG, MCA_PML_BASE_SEND_STANDARD, context->super.comm, reqs)); + ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, reqs, 1); + } + } + ompi_comm_request_start (request); + + *req = &request->super; + + + return OMPI_SUCCESS; +} + +int cid_pmix_nextcid (ompi_communicator_t *newcomm, ompi_communicator_t *comm, + ompi_communicator_t *bridgecomm, const void *arg0, const void *arg1, + bool send_first, int mode) +{ 
+ ompi_request_t *req; + int rc; + + rc = cid_pmix_nextcid_nb (newcomm, comm, bridgecomm, arg0, arg1, send_first, mode, &req); + if (OMPI_SUCCESS != rc) { + return rc; + } + + ompi_request_wait_completion (req); + rc = req->req_status.MPI_ERROR; + ompi_comm_request_return ((ompi_comm_request_t *) req); + + return rc; +} + +struct ompi_request_cid_t { + ompi_request_t super; + pmix_context_t *context; +}; + +typedef struct ompi_request_cid_t ompi_request_cid_t; + +static int reqcid_free(ompi_request_t **req) { + OBJ_RELEASE(*req); + return OMPI_SUCCESS; +} + +static int reqcid_cancel(ompi_request_t *req, int complete) { + return OMPI_ERROR; +} + +static void reqcid_cons(ompi_request_cid_t *req) { + req->super.req_type = OMPI_REQUEST_PMIX; + req->super.req_free = reqcid_free; + req->super.req_cancel = reqcid_cancel; + req->super.req_state = OMPI_REQUEST_ACTIVE; +} + +static OBJ_CLASS_INSTANCE(ompi_request_cid_t, ompi_request_t, reqcid_cons, NULL); + +static void getnextcid_cbfunc(int status, int cid, void *cbdata) { + ompi_request_cid_t *req = (ompi_request_cid_t *)cbdata; + if (OMPI_SUCCESS == status) { + req->context->super.nextlocal_cid = cid; + OMPI_REQUEST_FINI(&req->super); + ompi_request_complete(&req->super, true); + } +} + +static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + int64_t my_id = ((int64_t) ompi_comm_get_cid (context->super.comm) << 32 | context->super.pml_tag); + ompi_request_t *subreq = NULL; +#if 0 + bool flag; + int ret; +#endif + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + } + + if (ompi_comm_cid_lowest_id < my_id) { + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + } + + ompi_comm_cid_lowest_id = my_id; + + if (0 <= context->master) { + context->super.nextlocal_cid = 0; + 
return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_nextlocal_cid, NULL, 0); + } + /** + * This is the real algorithm described in the doc + */ + if (0 < opal_list_get_size(&context->localprocs)) { + if (context->super.nextlocal_cid < context->super.start) { + ompi_request_cid_t *req = OBJ_NEW(ompi_request_cid_t); + subreq = &req->super; + // context->nextlocal_cid = mca_pml.pml_max_contextid; + req->context = context; + opal_pmix.cid_nb(&context->localprocs, context->super.start, context->super.nextlocal_cid, getnextcid_cbfunc, req); + } + } else { + context->super.nextlocal_cid = 0; + } + return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_nextlocal_cid, &subreq, (NULL==subreq)?0:1); +} + +static int ompi_comm_allreduce_getlocalprocs (ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + participant_t *proc, *next; + + + OPAL_LIST_FOREACH_SAFE(proc, next, &context->localprocs, participant_t ) { + if (!proc->participate) { + opal_list_remove_item(&context->localprocs, &proc->super.super); + context->children --; + } + } + + if (context->participate) { + participant_t *proc = OBJ_NEW(participant_t); + proc->super.name = OPAL_PROC_MY_NAME; + proc->rank = ompi_comm_rank(context->super.comm); + proc->participate = 1; + opal_list_append(&context->localprocs, &proc->super.super); + } + + return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_getnextcid, NULL, 0); + +} + +static int ompi_comm_allreduce_nextlocal_cid (ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + int ret; + + ompi_request_t *subreq; + ret = context->super.allreduce_fn (&context->super.nextlocal_cid, &context->super.nextcid, 1, MPI_MAX, + &context->super, &subreq); + /* there was a failure during non-blocking collective + * all we can do is abort + */ + if (OMPI_SUCCESS != ret) { + goto err_exit; + } + + if ( ((unsigned int) context->super.nextlocal_cid 
== mca_pml.pml_max_contextid) ) { + /* Our local CID space is out, others already aware (allreduce above) */ + ret = OMPI_ERR_OUT_OF_RESOURCE; + goto err_exit; + } + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + /* next we want to verify that the resulting commid is ok */ + return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, &subreq, 1); +err_exit: + if (context->participate && context->super.nextlocal_cid != mca_pml.pml_max_contextid) { + opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->super.nextlocal_cid, NULL); + } + ompi_comm_cid_lowest_id = INT64_MAX; + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + return ret; + +} + +static void checkcid_cbfunc(int status, int cid, void *cbdata) { + ompi_request_cid_t *req = (ompi_request_cid_t *)cbdata; + if (OMPI_SUCCESS == status) { + if (cid == req->context->super.nextcid) { + req->context->super.flag = 1; + } + req->context->super.nextlocal_cid = cid; + OMPI_REQUEST_FINI(&req->super); + ompi_request_complete(&req->super, true); + } +} + +static int ompi_comm_checkcid (ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + ompi_request_t *subreq = NULL; + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0); + } + + if (0 <= context->master || 0 == opal_list_get_size(&context->localprocs)) { + context->super.flag = 1; + } else { + context->super.flag = (context->super.nextcid == context->super.nextlocal_cid); + if (!context->super.flag) { + ompi_request_cid_t *req = OBJ_NEW(ompi_request_cid_t); + subreq = &req->super; + req->context = context; + opal_pmix.cid_nb(&context->localprocs, context->super.nextcid, context->super.nextlocal_cid, checkcid_cbfunc, req); + } + } + + ++context->super.iter; + + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + return ompi_comm_request_schedule_append (request, ompi_comm_checkcid2, &subreq, (NULL==subreq)?0:1); +} + +static int ompi_comm_checkcid2 
(ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + ompi_request_t *subreq; + int ret; + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_checkcid2, NULL, 0); + } + + ret = context->super.allreduce_fn (&context->super.flag, &context->super.rflag, 1, MPI_MIN, &context->super, &subreq); + if (OMPI_SUCCESS == ret) { + ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, &subreq, 1); + } else { + if (context->participate && context->super.flag ) { + opal_pointer_array_test_and_set_item(&ompi_mpi_communicators, context->super.nextlocal_cid, NULL); + } + ompi_comm_cid_lowest_id = INT64_MAX; + } + + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + return ret; +} + +static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0); + } + if (1 == context->super.rflag) { + ompi_request_t ** reqs = NULL; + int j = 0; + if (0 <= context->master) { + reqs = (ompi_request_t **)alloca(sizeof(ompi_request_t *)); + if (context->participate) { + MCA_PML_CALL(irecv(&context->super.nextlocal_cid, 1, MPI_INT, context->master, OMPI_COMM_LOCAL_TAG, + context->super.comm, reqs)); + j++; + } + } else { + if (context->children > 0) { + participant_t *p; + reqs = (ompi_request_t **)alloca(context->children * sizeof(ompi_request_t *)); + OPAL_LIST_FOREACH(p, &context->localprocs, participant_t) { + if (ompi_comm_rank(context->super.comm) != p->rank) { + MCA_PML_CALL(isend(&context->super.nextlocal_cid, 1, MPI_INT, p->rank, OMPI_COMM_LOCAL_TAG, + MCA_PML_BASE_SEND_STANDARD, context->super.comm, reqs + j)); + j++; + } + } + assert(j == context->children); + } + } + OPAL_THREAD_UNLOCK(&ompi_cid_lock); /* the next stage (ompi_comm_nextcid_setcid) re-takes this lock with TRYLOCK; returning with it held would make that stage reschedule forever */ + return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_setcid, reqs, j); + } + + if
(context->super.flag) { + /* we could use this cid, but other don't agree */ + context->super.start = context->super.nextcid + 1; /* that's where we can start the next round */ + } + + ++context->super.iter; + + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + /* try again */ + return ompi_comm_allreduce_getnextcid (request); +} + +static int ompi_comm_nextcid_setcid (ompi_comm_request_t *request) +{ + pmix_context_t *context = (pmix_context_t *) request->context; + + if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) { + return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_setcid, NULL, 0); + } + + if( context->participate ) { + /* set the according values to the newcomm */ + context->super.newcomm->c_contextid = context->super.nextcid; + opal_pointer_array_set_item (&ompi_mpi_communicators, context->super.nextcid, context->super.newcomm); + } + + /* unlock the cid generator */ + ompi_comm_cid_lowest_id = INT64_MAX; + OPAL_THREAD_UNLOCK(&ompi_cid_lock); + + /* done! */ + return OMPI_SUCCESS; +} + +static void cid_cbfunc(int status, int cid, void *cbdata) { + bool *active = (bool *)cbdata; + *active = false; +} + +static int cid_pmix_release(int cid) +{ + opal_list_t procs; + opal_namelist_t proc; + bool active = true; + OBJ_CONSTRUCT(&procs, opal_list_t); + OBJ_CONSTRUCT(&proc, opal_namelist_t); + proc.name = OPAL_PROC_MY_NAME; + opal_list_append(&procs, &proc.super); + opal_pmix.cid_nb(&procs, 0, cid, cid_cbfunc, &active); + ORTE_WAIT_FOR_COMPLETION(active); + opal_list_remove_item(&procs, &proc.super); + OBJ_DESTRUCT(&procs); + OBJ_DESTRUCT(&proc); + return OMPI_SUCCESS; +} + +static int cid_pmix_release_nb (int cid, ompi_request_t **req) +{ + return OMPI_ERROR; +} diff --git a/ompi/mca/cid/pmix/cid_pmix.h b/ompi/mca/cid/pmix/cid_pmix.h new file mode 100644 index 00000000000..5a8dd0252d7 --- /dev/null +++ b/ompi/mca/cid/pmix/cid_pmix.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). 
All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef _MCA_CID_PMIX_H_ +#define _MCA_CID_PMIX_H_ + +#include "ompi_config.h" + +#include "opal/mca/base/base.h" + +#include "ompi/mca/cid/cid.h" + + +BEGIN_C_DECLS + +ORTE_MODULE_DECLSPEC extern ompi_cid_base_component_t mca_cid_pmix_component; +extern ompi_cid_base_module_t ompi_cid_pmix_module; + +END_C_DECLS + +#endif /* _MCA_CID_PMIX_H_ */ diff --git a/ompi/mca/cid/pmix/cid_pmix_component.c b/ompi/mca/cid/pmix/cid_pmix_component.c new file mode 100644 index 00000000000..3d61977cfce --- /dev/null +++ b/ompi/mca/cid/pmix/cid_pmix_component.c @@ -0,0 +1,46 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "orte/types.h" +#include "opal/types.h" + +#include "opal/util/show_help.h" + +#include "ompi/mca/cid/cid.h" + +#include "cid_pmix.h" + +static int ompi_cid_pmix_query(ompi_cid_base_module_t **module, int *priority, bool use_mpi_thread_multiple); + +/* + * Struct of function pointers and all that to let us be initialized + */ +ompi_cid_base_component_t mca_cid_pmix_component = { + .base_version = { + OMPI_CID_BASE_VERSION_1_0_0, + .mca_component_name = "pmix", + MCA_BASE_MAKE_VERSION(component, ORTE_MAJOR_VERSION, ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION), + }, + .base_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + .query= ompi_cid_pmix_query, +}; + +static int ompi_cid_pmix_query(ompi_cid_base_module_t **module, int *priority, bool use_mpi_thread_multiple) +{ + *module = &ompi_cid_pmix_module; + *priority = 20; + return ORTE_SUCCESS; +} diff --git a/ompi/mca/cid/pmix/owner.txt b/ompi/mca/cid/pmix/owner.txt new file mode 100644 index 00000000000..987d330b0ca ---
/dev/null +++ b/ompi/mca/cid/pmix/owner.txt @@ -0,0 +1,7 @@ +# +# owner/status file +# owner: institution that is responsible for this package +# status: e.g. active, maintenance, unmaintained +# +owner: RIST +status: active diff --git a/ompi/mpi/c/intercomm_create.c b/ompi/mpi/c/intercomm_create.c index e8405242dc6..971145fda4b 100644 --- a/ompi/mpi/c/intercomm_create.c +++ b/ompi/mpi/c/intercomm_create.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 University of Houston. All rights reserved. * Copyright (c) 2012-2013 Inria. All rights reserved. - * Copyright (c) 2014-2015 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Los Alamos National Security, LLC. All rights * reserved. @@ -32,6 +32,8 @@ #include "ompi/mca/pml/pml.h" #include "ompi/communicator/communicator.h" #include "ompi/request/request.h" +#include "ompi/mca/cid/cid.h" +#include "ompi/mca/cid/base/base.h" #include "ompi/memchecker.h" #if OMPI_BUILD_MPI_PROFILING @@ -198,14 +200,14 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader, new_group_pointer = MPI_GROUP_NULL; /* Determine context id. 
It is identical to f_2_c_handle */ - rc = ompi_comm_nextcid (newcomp, local_comm, bridge_comm, &lleader, + rc = ompi_cid->nextcid (newcomp, local_comm, bridge_comm, &lleader, &rleader, false, OMPI_COMM_CID_INTRA_BRIDGE); if ( MPI_SUCCESS != rc ) { goto err_exit; } /* activate comm and init coll-module */ - rc = ompi_comm_activate (&newcomp, local_comm, bridge_comm, &lleader, &rleader, + rc = ompi_cid->activate (&newcomp, local_comm, bridge_comm, &lleader, &rleader, false, OMPI_COMM_CID_INTRA_BRIDGE); if ( MPI_SUCCESS != rc ) { goto err_exit; diff --git a/ompi/mpi/c/intercomm_merge.c b/ompi/mpi/c/intercomm_merge.c index 12107764ce3..02b6f5b1665 100644 --- a/ompi/mpi/c/intercomm_merge.c +++ b/ompi/mpi/c/intercomm_merge.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2012 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2006-2009 University of Houston. All rights reserved. * Copyright (c) 2012-2013 Inria. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science + * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Los Alamos National Security, LLC. All rights * reserved. 
@@ -31,6 +31,8 @@ #include "ompi/runtime/params.h" #include "ompi/errhandler/errhandler.h" #include "ompi/communicator/communicator.h" +#include "ompi/mca/cid/cid.h" +#include "ompi/mca/cid/base/base.h" #include "ompi/proc/proc.h" #include "ompi/memchecker.h" @@ -117,14 +119,14 @@ int MPI_Intercomm_merge(MPI_Comm intercomm, int high, new_group_pointer = MPI_GROUP_NULL; /* Determine context id */ - rc = ompi_comm_nextcid (newcomp, intercomm, NULL, NULL, NULL, false, + rc = ompi_cid->nextcid (newcomp, intercomm, NULL, NULL, NULL, false, OMPI_COMM_CID_INTER); if ( OMPI_SUCCESS != rc ) { goto exit; } /* activate communicator and init coll-module */ - rc = ompi_comm_activate (&newcomp, intercomm, NULL, NULL, NULL, false, + rc = ompi_cid->activate (&newcomp, intercomm, NULL, NULL, NULL, false, OMPI_COMM_CID_INTER); if ( OMPI_SUCCESS != rc ) { goto exit; diff --git a/ompi/request/request_dbg.h b/ompi/request/request_dbg.h index 7251b96dc5b..8e4d0ba806b 100644 --- a/ompi/request/request_dbg.h +++ b/ompi/request/request_dbg.h @@ -1,6 +1,8 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -17,7 +19,7 @@ */ /** - * Enum inidicating the type of the request + * Enum indicating the type of the request */ typedef enum { OMPI_REQUEST_PML, /**< MPI point-to-point request */ @@ -28,6 +30,7 @@ typedef enum { OMPI_REQUEST_NULL, /**< NULL request */ OMPI_REQUEST_NOOP, /**< A request that does nothing (e.g., to PROC_NULL) */ OMPI_REQUEST_COMM, /**< MPI-3 non-blocking communicator duplication */ + OMPI_REQUEST_PMIX, /**< A PMIx CID request */ OMPI_REQUEST_MAX /**< Maximum request type */ } ompi_request_type_t; diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index f49ac00eaab..725b27f6a5a 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -18,7 +18,7 @@ * Copyright (c) 2011 Sandia National Laboratories. All rights reserved. * Copyright (c) 2012-2013 Inria. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2016 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. 
* @@ -89,6 +89,7 @@ #include "ompi/mca/io/base/base.h" #include "ompi/mca/rte/rte.h" #include "ompi/mca/rte/base/base.h" +#include "ompi/mca/cid/base/base.h" #include "ompi/debuggers/debuggers.h" #include "ompi/proc/proc.h" #include "ompi/mca/pml/base/pml_base_bsend.h" @@ -602,6 +603,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* Open up MPI-related MCA components */ + if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_cid_base_framework, 0))) { + error = "mca_cid_base_open() failed"; + goto error; + } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_allocator_base_framework, 0))) { error = "mca_allocator_base_open() failed"; goto error; @@ -921,8 +926,8 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) e.g. hierarch, might create subcommunicators. The threadlevel requested by all processes is required in order to know which cid allocation algorithm can be used. */ - if (OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) { - error = "ompi_mpi_init: ompi_comm_cid_init failed"; + if (OMPI_SUCCESS != ( ret = ompi_cid_base_select (ompi_mpi_thread_multiple))) { + error = "ompi_mpi_init: ompi_cid_base_select failed"; goto error; } diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index c9c7c9bb900..ba0deae40ff 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -3,6 +3,8 @@ * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2017-2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -876,6 +878,10 @@ typedef int (*opal_pmix_base_register_cleanup_fn_t)(char *path, bool directory, typedef bool (*opal_pmix_base_legacy_get_fn_t)(void); +/* cid */ +typedef int (*opal_pmix_base_cid_fn_t)(opal_list_t *procs, int start, int release, + opal_pmix_cid_cbfunc_t cbfunc, void *cbdata); + /* * the standard public API data structure */ @@ -912,6 +918,7 @@ typedef struct { opal_pmix_base_job_control_fn_t job_control; opal_pmix_base_process_monitor_fn_t monitor; opal_pmix_base_register_cleanup_fn_t register_cleanup; + opal_pmix_base_cid_fn_t cid_nb; /* server APIs */ opal_pmix_base_module_server_init_fn_t server_init; opal_pmix_base_module_server_finalize_fn_t server_finalize; diff --git a/opal/mca/pmix/pmix3x/pmix/include/pmix.h b/opal/mca/pmix/pmix3x/pmix/include/pmix.h index 583d2cd10f6..80a23745222 100644 --- a/opal/mca/pmix/pmix3x/pmix/include/pmix.h +++ b/opal/mca/pmix/pmix3x/pmix/include/pmix.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2013-2017 Intel, Inc. All rights reserved. - * Copyright (c) 2016 Research Organization for Information Science + * Copyright (c) 2016-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -557,6 +557,12 @@ PMIX_EXPORT pmix_status_t PMIx_Process_monitor_nb(const pmix_info_t *monitor, pm const pmix_info_t directives[], size_t ndirs, pmix_info_cbfunc_t cbfunc, void *cbdata); +/* Non-blocking CID (context id) request for the given procs; the outcome is + * delivered to cbfunc as (status, cid, cbdata). NOTE(review): presumably a NULL cbfunc is rejected - confirm.
*/ +PMIX_EXPORT pmix_status_t PMIx_CID_nb(const pmix_proc_t procs[], size_t nprocs, + const pmix_info_t info[], size_t ninfo, + pmix_cid_cbfunc_t cbfunc, void *cbdata); + /* define a special macro to simplify sending of a heartbeat */ #define PMIx_Heartbeat() \ do { \ diff --git a/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in b/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in index 7d9d8ffc7d8..3f141263e78 100644 --- a/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in +++ b/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2013-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2016-2017 Research Organization for Information Science + * Copyright (c) 2016-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. * Copyright (c) 2016-2017 Mellanox Technologies, Inc. @@ -275,6 +275,8 @@ typedef uint32_t pmix_rank_t; // specified operation #define PMIX_JOB_TERM_STATUS "pmix.job.term.status" // (pmix_status_t) status returned upon job termination #define PMIX_PROC_STATE_STATUS "pmix.proc.state" // (pmix_proc_state_t) process state +#define PMIX_START_CID "pmix.cid.start" // (int) where to start the search for the next available CID +#define PMIX_RELEASE_CID "pmix.cid.release" // (int) the CID to be released /* attributes used by host server to pass data to the server convenience library - the @@ -1744,6 +1746,8 @@ typedef void (*pmix_validation_cbfunc_t)(pmix_status_t status, pmix_info_t info[], size_t ninfo, void *cbdata); +typedef void (*pmix_cid_cbfunc_t)(pmix_status_t status, + int cid, void *cbdata); /**** COMMON SUPPORT FUNCTIONS ****/ /* Register an event handler to report events. 
Three types of events diff --git a/opal/mca/pmix/pmix3x/pmix/src/class/Makefile.include b/opal/mca/pmix/pmix3x/pmix/src/class/Makefile.include index 904995173d3..025244757ae 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/class/Makefile.include +++ b/opal/mca/pmix/pmix3x/pmix/src/class/Makefile.include @@ -12,6 +12,8 @@ # All rights reserved. # Copyright (c) 2013-2016 Intel, Inc. All rights reserved # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -30,7 +32,8 @@ headers += \ class/pmix_hash_table.h \ class/pmix_hotel.h \ class/pmix_ring_buffer.h \ - class/pmix_value_array.h + class/pmix_value_array.h \ + class/pmix_bitmap.h sources += \ class/pmix_object.c \ @@ -39,4 +42,5 @@ sources += \ class/pmix_hash_table.c \ class/pmix_hotel.c \ class/pmix_ring_buffer.c \ - class/pmix_value_array.c + class/pmix_value_array.c \ + class/pmix_bitmap.c diff --git a/opal/mca/pmix/pmix3x/pmix/src/class/pmix_bitmap.c b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_bitmap.c new file mode 100644 index 00000000000..372828c94da --- /dev/null +++ b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_bitmap.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2014 Intel, Inc. All rights reserved. 
+ * Copyright (c) 2015-2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "pmix_config.h" + +#include +#include + +#include "src/class/pmix_bitmap.h" + +/* The number of bits in the underlying type of the bitmap field + * in the pmix_bitmap_t struct + */ +#define SIZE_OF_BASE_TYPE 64 + +static void pmix_bitmap_construct(pmix_bitmap_t *bm); +static void pmix_bitmap_destruct(pmix_bitmap_t *bm); + +PMIX_CLASS_INSTANCE(pmix_bitmap_t, pmix_object_t, + pmix_bitmap_construct, pmix_bitmap_destruct); + + +static void +pmix_bitmap_construct(pmix_bitmap_t *bm) +{ + bm->bitmap = NULL; + bm->array_size = 0; + bm->max_size = INT_MAX; +} + + +static void +pmix_bitmap_destruct(pmix_bitmap_t *bm) +{ + if (NULL != bm->bitmap) { + free(bm->bitmap); + bm->bitmap = NULL; + } +} + + +int pmix_bitmap_set_max_size (pmix_bitmap_t *bm, int max_size) +{ + if (NULL == bm) { + return PMIX_ERR_BAD_PARAM; + } + + /* + * Only if the caller wants to set the maximum size, + * we set it (in numbers of bits!), otherwise it is + * set to INT_MAX in the constructor. + */ + bm->max_size = (int)(((size_t)max_size + SIZE_OF_BASE_TYPE - 1) / SIZE_OF_BASE_TYPE); + + return PMIX_SUCCESS; +} + + +int +pmix_bitmap_init(pmix_bitmap_t *bm, int size) +{ + /* + * Only if the caller set the maximum size before initializing, + * we test here (in numbers of bits!) + * By default, the max size is INT_MAX, set in the constructor. 
+ */ + if ((size <= 0) || (NULL == bm) || (size > bm->max_size)) { + return PMIX_ERR_BAD_PARAM; + } + + bm->array_size = (int)(((size_t)size + SIZE_OF_BASE_TYPE - 1) / SIZE_OF_BASE_TYPE); + if( NULL != bm->bitmap ) { + free(bm->bitmap); + if(bm->max_size < bm->array_size) + bm->max_size = bm->array_size; + } + bm->bitmap = (uint64_t*) malloc(bm->array_size * sizeof(uint64_t)); + if (NULL == bm->bitmap) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + + pmix_bitmap_clear_all_bits(bm); + return PMIX_SUCCESS; +} + + +int +pmix_bitmap_set_bit(pmix_bitmap_t *bm, int bit) +{ + int index, offset, new_size; + + if ((bit < 0) || (NULL == bm) || (bit > bm->max_size)) { + return PMIX_ERR_BAD_PARAM; + } + + index = bit / SIZE_OF_BASE_TYPE; + offset = bit % SIZE_OF_BASE_TYPE; + + if (index >= bm->array_size) { + + /* We need to allocate more space for the bitmap, since we are + out of range. We don't throw any error here, because this is + valid and we simply expand the bitmap */ + + new_size = index + 1; + if( new_size > bm->max_size ) + new_size = bm->max_size; + + /* New size is just a multiple of the original size to fit in + the index. 
*/ + bm->bitmap = (uint64_t*)realloc(bm->bitmap, new_size*sizeof(uint64_t)); + if (NULL == bm->bitmap) { + return PMIX_ERR_OUT_OF_RESOURCE; + } + + /* zero out the new elements */ + memset(&bm->bitmap[bm->array_size], 0, (new_size - bm->array_size) * sizeof(uint64_t)); + + /* Update the array_size */ + bm->array_size = new_size; + } + + /* Now set the bit (64-bit one: offset can reach 63, wider than a 32-bit unsigned long) */ + bm->bitmap[index] |= ((uint64_t)1 << offset); + + return PMIX_SUCCESS; +} + + +int +pmix_bitmap_clear_bit(pmix_bitmap_t *bm, int bit) +{ + int index, offset; + + if ((bit < 0) || NULL == bm || (bit >= (bm->array_size * SIZE_OF_BASE_TYPE))) { + return PMIX_ERR_BAD_PARAM; + } + + index = bit / SIZE_OF_BASE_TYPE; + offset = bit % SIZE_OF_BASE_TYPE; + + bm->bitmap[index] &= ~((uint64_t)1 << offset); + return PMIX_SUCCESS; +} + + +bool +pmix_bitmap_is_set_bit(pmix_bitmap_t *bm, int bit) +{ + int index, offset; + + if ((bit < 0) || NULL == bm || (bit >= (bm->array_size * SIZE_OF_BASE_TYPE))) { + return false; + } + + index = bit / SIZE_OF_BASE_TYPE; + offset = bit % SIZE_OF_BASE_TYPE; + + if (0 != (bm->bitmap[index] & ((uint64_t)1 << offset))) { + return true; + } + + return false; +} + + +int +pmix_bitmap_clear_all_bits(pmix_bitmap_t *bm) +{ + if (NULL == bm) { + return PMIX_ERR_BAD_PARAM; + } + + memset(bm->bitmap, 0, bm->array_size * sizeof(uint64_t)); + return PMIX_SUCCESS; +} + + +int +pmix_bitmap_set_all_bits(pmix_bitmap_t *bm) +{ + if (NULL == bm) { + return PMIX_ERR_BAD_PARAM; + } + + memset(bm->bitmap, 0xff, bm->array_size * sizeof(uint64_t)); + + return PMIX_SUCCESS; +} + + +int +pmix_bitmap_find_and_set_first_unset_bit(pmix_bitmap_t *bm, int *position) +{ + int i = 0; + uint64_t temp, all_ones = 0xffffffffffffffffUL; + + if (NULL == bm) { + return PMIX_ERR_BAD_PARAM; + } + + /* Neglect all which don't have an unset bit */ + *position = 0; + while((i < bm->array_size) && (bm->bitmap[i] == all_ones)) { + ++i; + } + + if (i == bm->array_size) { + /* increase the bitmap size then */ + *position = bm->array_size * SIZE_OF_BASE_TYPE;
+ return pmix_bitmap_set_bit(bm, *position); + } + + /* This one has an unset bit, find its bit number */ + + temp = bm->bitmap[i]; + bm->bitmap[i] |= (bm->bitmap[i] + 1); /* Set the first zero bit */ + temp ^= bm->bitmap[i]; /* Compute the change: the first unset bit in the original number */ + while( !(temp & 0x1) ) { + ++(*position); + temp >>= 1; + } + + (*position) += i * SIZE_OF_BASE_TYPE; + return PMIX_SUCCESS; +} + +int pmix_bitmap_bitwise_and_inplace(pmix_bitmap_t *dest, pmix_bitmap_t *right) +{ + int i; + + /* + * Sanity check + */ + if( NULL == dest || NULL == right ) { + return PMIX_ERR_BAD_PARAM; + } + if( dest->array_size != right->array_size ) { + return PMIX_ERR_BAD_PARAM; + } + + /* + * Bitwise AND + */ + for(i = 0; i < dest->array_size; ++i) { + dest->bitmap[i] &= right->bitmap[i]; + } + + return PMIX_SUCCESS; +} + +int pmix_bitmap_bitwise_or_inplace(pmix_bitmap_t *dest, pmix_bitmap_t *right) +{ + int i; + + /* + * Sanity check + */ + if( NULL == dest || NULL == right ) { + return PMIX_ERR_BAD_PARAM; + } + if( dest->array_size != right->array_size ) { + return PMIX_ERR_BAD_PARAM; + } + + /* + * Bitwise OR + */ + for(i = 0; i < dest->array_size; ++i) { + dest->bitmap[i] |= right->bitmap[i]; + } + + return PMIX_SUCCESS; +} + +int pmix_bitmap_bitwise_xor_inplace(pmix_bitmap_t *dest, pmix_bitmap_t *right) +{ + int i; + + /* + * Sanity check + */ + if( NULL == dest || NULL == right ) { + return PMIX_ERR_BAD_PARAM; + } + if( dest->array_size != right->array_size ) { + return PMIX_ERR_BAD_PARAM; + } + + /* + * Bitwise XOR + */ + for(i = 0; i < dest->array_size; ++i) { + dest->bitmap[i] ^= right->bitmap[i]; + } + + return PMIX_SUCCESS; +} + +bool pmix_bitmap_are_different(pmix_bitmap_t *left, pmix_bitmap_t *right) +{ + int i; + + /* + * Sanity check: a missing bitmap is reported as "different" + */ + if( NULL == left || NULL == right ) { + return true; + } + + if( pmix_bitmap_size(left) != pmix_bitmap_size(right) ) { + return true; + } + + /* + * Direct comparison + */ + for(i = 0; i <
left->array_size; ++i) { + if( left->bitmap[i] != right->bitmap[i] ) { + return true; + } + } + + return false; +} + +char * pmix_bitmap_get_string(pmix_bitmap_t *bitmap) +{ + int i; + char *bitmap_str = NULL; + + if( NULL == bitmap) { + return NULL; + } + + bitmap_str = malloc(bitmap->array_size * SIZE_OF_BASE_TYPE + 1); + if (NULL == bitmap_str) { + return NULL; + } + bitmap_str[bitmap->array_size * SIZE_OF_BASE_TYPE] = '\0'; + + for( i = 0; i < (bitmap->array_size * SIZE_OF_BASE_TYPE); ++i) { + if( pmix_bitmap_is_set_bit(bitmap, i) ) { + bitmap_str[i] = 'X'; + } else { + bitmap_str[i] = '_'; + } + } + + return bitmap_str; +} + +int pmix_bitmap_num_unset_bits(pmix_bitmap_t *bm, int len) +{ + return (len - pmix_bitmap_num_set_bits(bm, len)); +} + +int pmix_bitmap_num_set_bits(pmix_bitmap_t *bm, int len) +{ + int i, cnt = 0; + uint64_t val; + +#if PMIX_ENABLE_DEBUG + if ((len < 0) || NULL == bm || (len > (bm->array_size * SIZE_OF_BASE_TYPE))) { + return 0; + } +#endif + + for(i = 0; i < bm->array_size && i * SIZE_OF_BASE_TYPE < len; ++i) { /* len counts bits, i indexes 64-bit words */ + val = bm->bitmap[i]; + if( len - i * SIZE_OF_BASE_TYPE < SIZE_OF_BASE_TYPE ) val &= (((uint64_t)1 << (len - i * SIZE_OF_BASE_TYPE)) - 1); /* keep only the bits below len in the last word */ + /* Peter Wegner in CACM 3 (1960), 322: as many iterations as there are set bits. */ + for( ; val; cnt++ ) { + val &= val - 1; /* clear the least significant bit set */ + } + } + + return cnt; +} + +bool pmix_bitmap_is_clear(pmix_bitmap_t *bm) +{ + int i; + + for (i = 0; i < bm->array_size; ++i) { + if (0 != bm->bitmap[i]) { + return false; + } + } + return true; +} diff --git a/opal/mca/pmix/pmix3x/pmix/src/class/pmix_bitmap.h b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_bitmap.h new file mode 100644 index 00000000000..54d7670da82 --- /dev/null +++ b/opal/mca/pmix/pmix3x/pmix/src/class/pmix_bitmap.h @@ -0,0 +1,260 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved.
+ * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + */ + +/** @file + * + * A bitmap implementation. The bits start off with 0, so this bitmap + * has bits numbered as bit 0, bit 1, bit 2 and so on. This bitmap + * has auto-expansion capabilities, that is once the size is set + * during init, it can be automatically expanded by setting the bit + * beyond the current size. But note, this is allowed just when the + * bit is set -- so the valid functions are set_bit and + * find_and_set_bit. Other functions like clear, if passed a bit + * outside the initialized range will result in an error. + * + * To allow these bitmaps to track fortran handles (which MPI defines + * to be Fortran INTEGER), we offer a pmix_bitmap_set_max_size, so that + * the upper layer can ask to never have more than + * OMPI_FORTRAN_HANDLE_MAX, which is min(INT_MAX, fortran INTEGER max). 
+ */ + +#ifndef PMIX_BITMAP_H +#define PMIX_BITMAP_H + +#include "pmix_config.h" + +#include + +#include "src/class/pmix_object.h" + +BEGIN_C_DECLS + +struct pmix_bitmap_t { + pmix_object_t super; /**< Subclass of pmix_object_t */ + uint64_t *bitmap; /**< The actual bitmap, stored as an array of 64-bit words */ + int array_size; /**< The actual array size (in 64-bit words) that maintains the bitmap */ + int max_size; /**< The maximum size that this bitmap may grow (optional) */ +}; + +typedef struct pmix_bitmap_t pmix_bitmap_t; + +PMIX_CLASS_DECLARATION(pmix_bitmap_t); + +/** + * Set the maximum size of the bitmap. + * May be reset any time, but HAS TO BE SET BEFORE pmix_bitmap_init! + * + * @param bitmap The input bitmap (IN) + * @param max_size The maximum size of the bitmap in terms of bits (IN) + * @return PMIx error code or success + * + */ +PMIX_EXPORT int pmix_bitmap_set_max_size (pmix_bitmap_t *bm, int max_size); + + +/** + * Initializes the bitmap and sets its size. This must be called + * before the bitmap can be actually used + * + * @param bitmap The input bitmap (IN) + * @param size The initial size of the bitmap in terms of bits (IN) + * @return PMIx error code or success + * + */ +PMIX_EXPORT int pmix_bitmap_init (pmix_bitmap_t *bm, int size); + + +/** + * Set a bit of the bitmap. If the bit asked for is beyond the current + * size of the bitmap, then the bitmap is extended to accommodate the + * bit + * + * @param bitmap The input bitmap (IN) + * @param bit The bit which is to be set (IN) + * @return PMIx error code or success + * + */ +PMIX_EXPORT int pmix_bitmap_set_bit(pmix_bitmap_t *bm, int bit); + + +/** + * Clear/unset a bit of the bitmap.
If the bit is beyond the current + * size of the bitmap, an error is returned + * + * @param bitmap The input bitmap (IN) + * @param bit The bit which is to be cleared (IN) + * @return PMIx error code if the bit is out of range, else success + * + */ +PMIX_EXPORT int pmix_bitmap_clear_bit(pmix_bitmap_t *bm, int bit); + + +/** + * Find out if a bit is set in the bitmap + * + * @param bitmap The input bitmap (IN) + * @param bit The bit which is to be checked (IN) + * @return true if the bit is set + * false if the bit is not set OR the index + * is outside the bounds of the provided + * bitmap + * + */ +PMIX_EXPORT bool pmix_bitmap_is_set_bit(pmix_bitmap_t *bm, int bit); + + +/** + * Find the first clear bit in the bitmap and set it + * + * @param bitmap The input bitmap (IN) + * @param position Position of the first clear bit (OUT) + + * @return err PMIX_SUCCESS on success + */ +PMIX_EXPORT int pmix_bitmap_find_and_set_first_unset_bit(pmix_bitmap_t *bm, + int *position); + + +/** + * Clear all bits in the bitmap + * + * @param bitmap The input bitmap (IN) + * @return PMIx error code if bm is NULL + * + */ +PMIX_EXPORT int pmix_bitmap_clear_all_bits(pmix_bitmap_t *bm); + + +/** + * Set all bits in the bitmap + * @param bitmap The input bitmap (IN) + * @return PMIx error code if bm is NULL + * + */ +PMIX_EXPORT int pmix_bitmap_set_all_bits(pmix_bitmap_t *bm); + + +/** + * Gives the current size (number of bits) in the bitmap. This is the + * legal (accessible) number of bits + * + * @param bitmap The input bitmap (IN) + * @return Number of bits, or 0 if bm is NULL + * + */ +static inline int pmix_bitmap_size(pmix_bitmap_t *bm) +{ + return (NULL == bm) ?
0 : (bm->array_size * ((int) (sizeof(*bm->bitmap) * 8))); +} + + +/** + * Copy a bitmap + * + * @param dest Pointer to the destination bitmap + * @param src Pointer to the source bitmap + * @note Returns nothing; the malloc result is not checked before the memcpy + */ +static inline void pmix_bitmap_copy(pmix_bitmap_t *dest, pmix_bitmap_t *src) +{ + if( dest->array_size < src->array_size ) { + if( NULL != dest->bitmap) free(dest->bitmap); + dest->max_size = src->max_size; + dest->bitmap = (uint64_t*)malloc(src->array_size*sizeof(uint64_t)); + } + memcpy(dest->bitmap, src->bitmap, src->array_size * sizeof(uint64_t)); + dest->array_size = src->array_size; +} + +/** + * Bitwise AND operator (inplace) + * + * @param dest Pointer to the bitmap that should be modified + * @param right Pointer to the other bitmap in the operation + * @return PMIx error code if the length of the two bitmaps is not equal or one is NULL. + */ +PMIX_EXPORT int pmix_bitmap_bitwise_and_inplace(pmix_bitmap_t *dest, pmix_bitmap_t *right); + +/** + * Bitwise OR operator (inplace) + * + * @param dest Pointer to the bitmap that should be modified + * @param right Pointer to the other bitmap in the operation + * @return PMIx error code if the length of the two bitmaps is not equal or one is NULL. + */ +PMIX_EXPORT int pmix_bitmap_bitwise_or_inplace(pmix_bitmap_t *dest, pmix_bitmap_t *right); + +/** + * Bitwise XOR operator (inplace) + * + * @param dest Pointer to the bitmap that should be modified + * @param right Pointer to the other bitmap in the operation + * @return PMIx error code if the length of the two bitmaps is not equal or one is NULL.
+ */ +PMIX_EXPORT int pmix_bitmap_bitwise_xor_inplace(pmix_bitmap_t *dest, pmix_bitmap_t *right); + +/** + * If the bitmaps are different + * + * @param left Pointer to a bitmap + * @param right Pointer to another bitmap + * @return true if different, false if the same + */ +PMIX_EXPORT bool pmix_bitmap_are_different(pmix_bitmap_t *left, pmix_bitmap_t *right); + +/** + * Get a string representation of the bitmap. + * Useful for debugging. + * + * @param bitmap Point to the bitmap to represent + * @return Pointer to the string (caller must free if not NULL) + */ +PMIX_EXPORT char * pmix_bitmap_get_string(pmix_bitmap_t *bitmap); + +/** + * Return the number of 'unset' bits, upto the specified length + * + * @param bitmap Pointer to the bitmap + * @param len Number of bits to check + * @return Integer + */ +PMIX_EXPORT int pmix_bitmap_num_unset_bits(pmix_bitmap_t *bm, int len); + +/** + * Return the number of 'set' bits, upto the specified length + * + * @param bitmap Pointer to the bitmap + * @param len Number of bits to check + * @return Integer + */ +PMIX_EXPORT int pmix_bitmap_num_set_bits(pmix_bitmap_t *bm, int len); + +/** + * Check a bitmap to see if any bit is set + */ +PMIX_EXPORT bool pmix_bitmap_is_clear(pmix_bitmap_t *bm); + +END_C_DECLS + +#endif diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/Makefile.include b/opal/mca/pmix/pmix3x/pmix/src/client/Makefile.include index 80801cc4aaf..5028c440114 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/client/Makefile.include +++ b/opal/mca/pmix/pmix3x/pmix/src/client/Makefile.include @@ -4,6 +4,8 @@ # Copyright (c) 2014 Artem Y. Polyakov . # All rights reserved. # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2018 Research Organization for Information Science +# and Technology (RIST). All rights reserved. 
# $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -16,6 +18,7 @@ headers += \
 
 sources += \
         client/pmix_client.c \
+        client/pmix_client_cid.c \
         client/pmix_client_fence.c \
         client/pmix_client_get.c \
         client/pmix_client_pub.c \
diff --git a/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_cid.c b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_cid.c
new file mode 100644
index 00000000000..099f25b6cc1
--- /dev/null
+++ b/opal/mca/pmix/pmix3x/pmix/src/client/pmix_client_cid.c
@@ -0,0 +1,286 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2018 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include "src/include/pmix_globals.h"
+
+#ifdef HAVE_STRING_H
+#include
+#endif
+#include
+#ifdef HAVE_UNISTD_H
+#include
+#endif
+#ifdef HAVE_SYS_SOCKET_H
+#include
+#endif
+#ifdef HAVE_SYS_UN_H
+#include
+#endif
+#ifdef HAVE_SYS_UIO_H
+#include
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include
+#endif
+#include PMIX_EVENT_HEADER
+
+#include "src/class/pmix_list.h"
+#include "src/mca/bfrops/bfrops.h"
+#include "src/util/argv.h"
+#include "src/util/error.h"
+#include "src/util/hash.h"
+#include "src/util/output.h"
+#include "src/mca/ptl/ptl.h"
+
+#include "pmix_client_ops.h"
+
+static pmix_status_t unpack_return(pmix_buffer_t *data, int *cid);
+static pmix_status_t pack_cid(pmix_buffer_t *msg, pmix_cmd_t cmd,
+                              const pmix_proc_t *procs, size_t nprocs,
+                              const pmix_info_t *info, size_t ninfo);
+static void wait_cbfunc(struct pmix_peer_t *pr,
+                        pmix_ptl_hdr_t *hdr,
+                        pmix_buffer_t *buf, void *cbdata);
+
+/**
+ * Send a context-id (CID) request to the local server without blocking.
+ *
+ * The command, the procs and the info directives are packed into one
+ * buffer and posted to the server. The reply is decoded in wait_cbfunc(),
+ * which hands the resulting status and CID to cbfunc.
+ *
+ * @param procs   Array of processes the request applies to
+ *                (may be NULL only when nprocs is 0)
+ * @param nprocs  Number of entries in procs
+ * @param info    Array of directives forwarded to the server
+ *                (may be NULL only when ninfo is 0)
+ * @param ninfo   Number of entries in info
+ * @param cbfunc  Function called once the reply arrives (may be NULL)
+ * @param cbdata  Opaque pointer passed back to cbfunc
+ * @return PMIX_SUCCESS if the request was posted; on any other value
+ *         the request was not sent and cbfunc will not be called
+ */
+PMIX_EXPORT pmix_status_t PMIx_CID_nb(const pmix_proc_t procs[], size_t nprocs,
+                                      const pmix_info_t info[], size_t ninfo,
+                                      pmix_cid_cbfunc_t cbfunc, void *cbdata)
+{
+    pmix_buffer_t *msg;
+    pmix_cmd_t cmd = PMIX_CIDNB_CMD;
+    pmix_status_t rc;
+    pmix_cb_t *cb;
+
+    PMIX_ACQUIRE_THREAD(&pmix_global_lock);
+
+    pmix_output_verbose(2, pmix_globals.debug_output,
+                        "pmix: cid_nb called");
+
+    if (pmix_globals.init_cntr <= 0) {
+        PMIX_RELEASE_THREAD(&pmix_global_lock);
+        return PMIX_ERR_INIT;
+    }
+
+    /* if we aren't connected, don't attempt to send */
+    if (!pmix_globals.connected) {
+        PMIX_RELEASE_THREAD(&pmix_global_lock);
+        return PMIX_ERR_UNREACH;
+    }
+    PMIX_RELEASE_THREAD(&pmix_global_lock);
+
+    /* check for bozo input: a non-zero count needs a matching array */
+    if ((NULL == procs && 0 != nprocs) ||
+        (NULL == info && 0 != ninfo)) {
+        return PMIX_ERR_BAD_PARAM;
+    }
+
+    msg = PMIX_NEW(pmix_buffer_t);
+    if (NULL == msg) {
+        return PMIX_ERR_NOMEM;
+    }
+    if (PMIX_SUCCESS != (rc = pack_cid(msg, cmd, procs, nprocs, info, ninfo))) {
+        PMIX_RELEASE(msg);
+        return rc;
+    }
+
+    /* create a callback object as we need to pass it to the
+     * recv routine so we know which callback to use when
+     * the return message is recvd */
+    cb = PMIX_NEW(pmix_cb_t);
+    if (NULL == cb) {
+        PMIX_RELEASE(msg);
+        return PMIX_ERR_NOMEM;
+    }
+    cb->cbfunc.cidfn = cbfunc;
+    cb->cbdata = cbdata;
+
+    /* push the message into our event base to send to the server */
+    PMIX_PTL_SEND_RECV(rc, pmix_client_globals.myserver,
+                       msg, wait_cbfunc, (void*)cb);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_RELEASE(msg);
+        PMIX_RELEASE(cb);
+    }
+    return rc;
+}
+
+/**
+ * Decode the server's reply to a CID request.
+ *
+ * The reply carries a status code followed by the CID.
+ *
+ * @param data  Buffer holding the reply
+ * @param cid   Where the unpacked CID is stored (OUT); only meaningful
+ *              when PMIX_SUCCESS is returned
+ * @return the status reported by the server, or the unpack error if
+ *         the reply could not be decoded
+ */
+static pmix_status_t unpack_return(pmix_buffer_t *data, int *cid)
+{
+    pmix_status_t rc;
+    pmix_status_t ret;
+    int32_t cnt;
+
+    pmix_output_verbose(2, pmix_globals.debug_output,
+                        "client:unpack cid called");
+
+    /* unpack the status code */
+    cnt = 1;
+    PMIX_BFROPS_UNPACK(rc, pmix_client_globals.myserver,
+                       data, &ret, &cnt, PMIX_STATUS);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* unpack the cid */
+    cnt = 1;
+    PMIX_BFROPS_UNPACK(rc, pmix_client_globals.myserver,
+                       data, cid, &cnt, PMIX_INT);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        /* never report success with a cid we failed to read; a
+         * failure already reported by the server takes precedence */
+        return (PMIX_SUCCESS != ret) ? ret : rc;
+    }
+
+    pmix_output_verbose(2, pmix_globals.debug_output,
+                        "client:unpack cid received status %d", ret);
+    return ret;
+}
+
+/**
+ * Serialize a CID request into msg.
+ *
+ * Packing order: command, number of procs, procs, number of info,
+ * info.
+ *
+ * @param msg     Buffer to pack into
+ * @param cmd     Command code identifying the request
+ * @param procs   Array of nprocs processes
+ * @param nprocs  Number of entries in procs
+ * @param info    Array of ninfo directives
+ * @param ninfo   Number of entries in info
+ * @return PMIX_SUCCESS, or the error of the first pack that failed
+ */
+static pmix_status_t pack_cid(pmix_buffer_t *msg, pmix_cmd_t cmd,
+                              const pmix_proc_t *procs, size_t nprocs,
+                              const pmix_info_t *info, size_t ninfo)
+{
+    pmix_status_t rc;
+
+    /* pack the cmd */
+    PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
+                     msg, &cmd, 1, PMIX_COMMAND);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        return rc;
+    }
+
+    /* pack the number of procs */
+    PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
+                     msg, &nprocs, 1, PMIX_SIZE);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack all provided procs */
+    PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
+                     msg, procs, nprocs, PMIX_PROC);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack the number of info */
+    PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
+                     msg, &ninfo, 1, PMIX_SIZE);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        return rc;
+    }
+    /* pack all provided info */
+    PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
+                     msg, info, ninfo, PMIX_INFO);
+    if (PMIX_SUCCESS != rc) {
+        PMIX_ERROR_LOG(rc);
+        return rc;
+    }
+
+    return PMIX_SUCCESS;
+}
+
+/**
+ * Receive callback for the reply to a CID request.
+ *
+ * Decodes the reply, passes the status and CID to the user callback
+ * stored in the pmix_cb_t, and releases that object.
+ *
+ * @param pr      Peer the reply came from (unused)
+ * @param hdr     Header of the reply (unused)
+ * @param buf     Reply payload; empty when the connection was lost
+ * @param cbdata  The pmix_cb_t created by PMIx_CID_nb
+ */
+static void wait_cbfunc(struct pmix_peer_t *pr, pmix_ptl_hdr_t *hdr,
+                        pmix_buffer_t *buf, void *cbdata)
+{
+    pmix_cb_t *cb = (pmix_cb_t*)cbdata;
+    pmix_status_t rc;
+    /* handed to the callback even when no cid could be read, so it
+     * must hold a defined value */
+    int cid = -1;
+
+    pmix_output_verbose(2, pmix_globals.debug_output,
+                        "pmix: cid_nb callback recvd");
+
+    if (NULL == cb) {
+        PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM);
+        return;
+    }
+    /* a zero-byte buffer indicates that this recv is being
+     * completed due to a lost connection */
+    if (PMIX_BUFFER_IS_EMPTY(buf)) {
+        rc = PMIX_ERR_UNREACH;
+    } else {
+        rc = unpack_return(buf, &cid);
+    }
+
+    /* if a callback was provided, execute it */
+    if (NULL != cb->cbfunc.cidfn) {
+        cb->cbfunc.cidfn(rc, cid, cb->cbdata);
+    }
+    PMIX_RELEASE(cb);
+}
diff --git a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_strings.c b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_strings.c
index de5ac6ebe2f..f2014f62c82 100644
--- a/opal/mca/pmix/pmix3x/pmix/src/common/pmix_strings.c
+++ b/opal/mca/pmix/pmix3x/pmix/src/common/pmix_strings.c
@@ -12,6 +12,8 @@
  * Copyright (c) 2007-2012 Los Alamos National Security, LLC.
  * All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -218,6 +220,8 @@ PMIX_EXPORT const char* pmix_command_string(pmix_cmd_t cmd) return "MONITOR"; case PMIX_IOF_CMD: return "IOF"; + case PMIX_CIDNB_CMD: + return "CID"; default: return "UNKNOWN"; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c index c7aba47dd7e..66731b96128 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c +++ b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c @@ -1,8 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science - * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . * All rights reserved. @@ -152,12 +151,14 @@ static void info_con(pmix_rank_info_t *info) info->modex_recvd = false; info->proc_cnt = 0; info->server_object = NULL; + PMIX_CONSTRUCT(&info->cids, pmix_bitmap_t); } static void info_des(pmix_rank_info_t *info) { if (NULL != info->pname.nspace) { free(info->pname.nspace); } + PMIX_DESTRUCT(&info->cids); } PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_rank_info_t, pmix_list_item_t, diff --git a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h index 3963a33b726..cd2dd16c8e1 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h +++ b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h @@ -11,6 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2014-2018 Intel, Inc. 
All rights reserved. + * Copyright (c) 2018 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,6 +39,7 @@ #include "src/class/pmix_hash_table.h" #include "src/class/pmix_list.h" #include "src/class/pmix_ring_buffer.h" +#include "src/class/pmix_bitmap.h" #include "src/event/pmix_event.h" #include "src/threads/threads.h" @@ -95,6 +98,7 @@ typedef uint8_t pmix_cmd_t; #define PMIX_GET_CREDENTIAL_CMD 20 #define PMIX_VALIDATE_CRED_CMD 21 #define PMIX_IOF_CMD 22 +#define PMIX_CIDNB_CMD 23 /* provide a "pretty-print" function for cmds */ const char* pmix_command_string(pmix_cmd_t cmd); @@ -182,6 +186,7 @@ typedef struct pmix_rank_info_t { bool modex_recvd; int proc_cnt; // #clones of this rank we know about void *server_object; // pointer to rank-specific object provided by server + pmix_bitmap_t cids; } pmix_rank_info_t; PMIX_CLASS_DECLARATION(pmix_rank_info_t); @@ -349,6 +354,7 @@ typedef struct { pmix_spawn_cbfunc_t spawnfn; pmix_connect_cbfunc_t cnctfn; pmix_hdlr_reg_cbfunc_t hdlrregfn; + pmix_cid_cbfunc_t cidfn; } cbfunc; size_t errhandler_ref; void *cbdata; diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c index 7d55edf0df5..8e223ecf228 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server.c @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . * All rights reserved. 
@@ -2766,6 +2766,13 @@ static pmix_status_t server_switchyard(pmix_peer_t *peer, uint32_t tag, rc = pmix_server_iofreg(peer, buf, iof_cbfunc, cd); return rc; } + if (PMIX_CIDNB_CMD == cmd) { + PMIX_GDS_CADDY(cd, peer, tag); + if (PMIX_SUCCESS != (rc = pmix_server_cid(peer, buf, cd))) { + PMIX_RELEASE(cd); + } + return rc; + } return PMIX_ERR_NOT_SUPPORTED; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c index e3c78a9660d..748b2b0aad4 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Artem Y. Polyakov . * All rights reserved. 
@@ -49,6 +49,7 @@ #include PMIX_EVENT_HEADER #include "src/class/pmix_list.h" +#include "src/class/pmix_bitmap.h" #include "src/mca/bfrops/bfrops.h" #include "src/util/argv.h" #include "src/util/error.h" @@ -2722,6 +2723,169 @@ pmix_status_t pmix_server_iofreg(pmix_peer_t *peer, return rc; } +pmix_status_t pmix_server_cid(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_server_caddy_t *cd) +{ + int32_t cnt; + pmix_status_t rc; + size_t nprocs, n, ninfo=0; + pmix_proc_t *procs=NULL; + pmix_buffer_t *reply; + pmix_info_t *info = NULL; + int start, release; + bool alloc=false, dealloc=false; + pmix_bitmap_t cids; + int cid = 0; + pmix_rank_info_t *rank, **ranks = NULL; + + pmix_output_verbose(2, pmix_server_globals.fence_output, + "recvd CID"); + + /* unpack the number of procs */ + cnt = 1; + PMIX_BFROPS_UNPACK(rc, cd->peer, buf, &nprocs, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + return rc; + } + pmix_output_verbose(2, pmix_server_globals.fence_output, + "recvd cid from %s:%u with %d procs", + cd->peer->info->pname.nspace, cd->peer->info->pname.rank, (int)nprocs); + /* there must be at least one */ + if (nprocs < 1) { + return PMIX_ERR_BAD_PARAM; + } + + /* create space for the procs */ + PMIX_PROC_CREATE(procs, nprocs); + if (NULL == procs) { + return PMIX_ERR_NOMEM; + } + /* unpack the procs */ + cnt = nprocs; + PMIX_BFROPS_UNPACK(rc, cd->peer, buf, procs, &cnt, PMIX_PROC); + if (PMIX_SUCCESS != rc) { + goto cleanup; + } + + /* unpack the number of provided info structs */ + cnt = 1; + PMIX_BFROPS_UNPACK(rc, cd->peer, buf, &ninfo, &cnt, PMIX_SIZE); + if (PMIX_SUCCESS != rc) { + return rc; + } + if (0 < ninfo) { + PMIX_INFO_CREATE(info, ninfo); + if (NULL == info) { + PMIX_PROC_FREE(procs, nprocs); + return PMIX_ERR_NOMEM; + } + /* unpack the info */ + cnt = ninfo; + PMIX_BFROPS_UNPACK(rc, cd->peer, buf, info, &cnt, PMIX_INFO); + if (PMIX_SUCCESS != rc) { + goto cleanup; + } + for (n=0; n < ninfo; n++) { + if (0 == strcmp(info[n].key, PMIX_START_CID)){ + start 
= info[n].value.data.integer; + alloc = true; + } else if (0 == strcmp(info[n].key, PMIX_RELEASE_CID)) { + release = info[n].value.data.integer; + dealloc = true; + } + } + PMIX_INFO_FREE(info, ninfo); + } + + ranks = malloc(nprocs * sizeof(pmix_rank_info_t *)); + /* see if we know this nspace */ + for (n=0; nnspace, procs[n].nspace)) { + nptr = tmp; + break; + } + } + if (NULL == nptr) { + /* send an error reply to the client */ + rc = PMIX_ERR_NOT_FOUND; + goto cleanup; + } + + /* see if we have this peer in our list */ + found = false; + PMIX_LIST_FOREACH(rank, &nptr->ranks, pmix_rank_info_t) { + if (rank->pname.rank == procs[n].rank) { + found = true; + break; + } + } + if (!found) { + /* send an error reply to the client */ + rc = PMIX_ERR_NOT_FOUND; + goto cleanup; + } + ranks[n] = rank; + } + + + if (alloc) { + PMIX_CONSTRUCT(&cids, pmix_bitmap_t); + int size = 0; + for (n=0; ncids); + if (sze > size) size = sze; + } + if (0 < size) { + pmix_bitmap_init(&cids, size); + } + for (n=0; ncids); + } + for (n=0; ncids, cid); + } + if (dealloc) { + pmix_bitmap_clear_bit(&ranks[n]->cids, release); + } + } + + cleanup: + PMIX_PROC_FREE(procs, nprocs); + if (NULL != ranks) { + free(ranks); + } + reply = PMIX_NEW(pmix_buffer_t); + if (NULL == reply) { + return PMIX_ERR_NOMEM; + } + PMIX_BFROPS_PACK(rc, peer, reply, &rc, 1, PMIX_STATUS); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(reply); + return rc; + } + PMIX_BFROPS_PACK(rc, peer, reply, &cid, 1, PMIX_INT); + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(reply); + return rc; + } + PMIX_PTL_SEND_ONEWAY(rc, cd->peer, reply, cd->hdr.tag); + + return rc; +} + /***** INSTANCE SERVER LIBRARY CLASSES *****/ static void tcon(pmix_server_trkr_t *t) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h index 817f3011caa..3bf261159b6 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h +++ 
b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.h @@ -6,7 +6,7 @@ * Copyright (c) 2015 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. - * Copyright (c) 2016 Research Organization for Information Science + * Copyright (c) 2016-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ */ @@ -273,6 +273,10 @@ pmix_status_t pmix_server_event_recvd_from_client(pmix_peer_t *peer, void *cbdata); void pmix_server_execute_collective(int sd, short args, void *cbdata); +pmix_status_t pmix_server_cid(pmix_peer_t *peer, + pmix_buffer_t *buf, + pmix_server_caddy_t *cd); + PMIX_EXPORT extern pmix_server_module_t pmix_host_server; PMIX_EXPORT extern pmix_server_globals_t pmix_server_globals; diff --git a/opal/mca/pmix/pmix3x/pmix3x.c b/opal/mca/pmix/pmix3x/pmix3x.c index 04e58c2da01..46a66cb4fad 100644 --- a/opal/mca/pmix/pmix3x/pmix3x.c +++ b/opal/mca/pmix/pmix3x/pmix3x.c @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Mellanox Technologies, Inc. * All rights reserved. @@ -108,6 +108,7 @@ const opal_pmix_base_module_t opal_pmix_pmix3x_module = { .allocate = pmix3x_allocate, .job_control = pmix3x_job_control, .register_cleanup = pmix3x_register_cleanup, + .cid_nb = pmix3x_cidnb, /* server APIs */ .server_init = pmix3x_server_init, .server_finalize = pmix3x_server_finalize, diff --git a/opal/mca/pmix/pmix3x/pmix3x.h b/opal/mca/pmix/pmix3x/pmix3x.h index cbf73f644f2..e75fb39e9ee 100644 --- a/opal/mca/pmix/pmix3x/pmix3x.h +++ b/opal/mca/pmix/pmix3x/pmix3x.h @@ -3,7 +3,7 @@ * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. 
* Copyright (c) 2014-2015 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2016 Research Organization for Information Science + * Copyright (c) 2016-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 Los Alamos National Security, LLC. All rights * reserved. @@ -109,6 +109,7 @@ typedef struct { opal_pmix_spawn_cbfunc_t spcbfunc; opal_pmix_evhandler_reg_cbfunc_t evregcbfunc; opal_pmix_info_cbfunc_t qcbfunc; + opal_pmix_cid_cbfunc_t cidcbfunc; void *cbdata; } pmix3x_opcaddy_t; OBJ_CLASS_DECLARATION(pmix3x_opcaddy_t); @@ -258,6 +259,8 @@ OPAL_MODULE_DECLSPEC int pmix3x_allocate(opal_pmix_alloc_directive_t directive, OPAL_MODULE_DECLSPEC int pmix3x_job_control(opal_list_t *targets, opal_list_t *directives, opal_pmix_info_cbfunc_t cbfunc, void *cbdata); +OPAL_MODULE_DECLSPEC int pmix3x_cidnb(opal_list_t *procs, int start, int release, + opal_pmix_cid_cbfunc_t cbfunc, void *cbdata); /**** TOOL FUNCTIONS ****/ OPAL_MODULE_DECLSPEC int pmix3x_tool_init(opal_list_t *info); diff --git a/opal/mca/pmix/pmix3x/pmix3x_client.c b/opal/mca/pmix/pmix3x/pmix3x_client.c index 4ccf21c519a..7cbe463a3ce 100644 --- a/opal/mca/pmix/pmix3x/pmix3x_client.c +++ b/opal/mca/pmix/pmix3x/pmix3x_client.c @@ -1,7 +1,7 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2014-2018 Intel, Inc. All rights reserved. - * Copyright (c) 2014-2017 Research Organization for Information Science + * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2017 Mellanox Technologies, Inc. * All rights reserved. 
@@ -1585,17 +1585,31 @@ static void infocbfunc(pmix_status_t status,
                        void *release_cbdata)
 {
     pmix3x_opcaddy_t *op = (pmix3x_opcaddy_t*)cbdata;
+    opal_list_t results;
     int rc;
 
     if (NULL != release_fn) {
         release_fn(release_cbdata);
     }
     rc = pmix3x_convert_rc(status);
+    OBJ_CONSTRUCT(&results, opal_list_t);
+    if (PMIX_SUCCESS == status) {
+        for (size_t n=0; nkey = strdup(info[n].key);
+            status = pmix3x_value_unload(val, &info[n].value);
+            if (PMIX_SUCCESS == status) {
+                opal_list_append(&results, &val->super);
+            }
+        }
+        PMIX_INFO_FREE(info, ninfo);
+    }
     if (NULL != op->qcbfunc) {
-        op->qcbfunc(rc, NULL, op->cbdata, relcbfunc, op);
+        op->qcbfunc(rc, &results, op->cbdata, relcbfunc, op);
     } else {
         OBJ_RELEASE(op);
     }
+    OPAL_LIST_DESTRUCT(&results);
 }
 
 int pmix3x_allocate(opal_pmix_alloc_directive_t directive,
@@ -1662,3 +1676,116 @@ int pmix3x_job_control(opal_list_t *targets,
     }
     return pmix3x_convert_rc(rc);
 }
+
+/**
+ * Completion callback handed to PMIx_CID_nb by pmix3x_cidnb.
+ *
+ * Translates the PMIx status to an OPAL code, forwards it together
+ * with the CID to the OPAL-level callback, and releases the caddy.
+ *
+ * @param status  PMIx status of the request
+ * @param cid     CID reported with the reply
+ * @param cbdata  The pmix3x_opcaddy_t created by pmix3x_cidnb
+ */
+static void cid_cbfunc(pmix_status_t status,
+                       int cid, void *cbdata)
+{
+    pmix3x_opcaddy_t *op = (pmix3x_opcaddy_t*)cbdata;
+    int rc;
+
+    OPAL_ACQUIRE_OBJECT(op);
+    /* status is a PMIx code: convert PMIx -> OPAL with the same
+     * helper infocbfunc and pmix3x_cidnb use for PMIx statuses */
+    rc = pmix3x_convert_rc(status);
+
+    if (NULL != op->cidcbfunc) {
+        op->cidcbfunc(rc, cid, op->cbdata);
+    }
+    OBJ_RELEASE(op);
+}
+
+/**
+ * Non-blocking CID request through the embedded PMIx library.
+ *
+ * @param procs    List of opal_namelist_t naming the participants
+ * @param start    Passed to the server as PMIX_START_CID when > 0
+ * @param release  Passed to the server as PMIX_RELEASE_CID when > 0
+ * @param cbfunc   Called with the OPAL status and the CID on completion
+ * @param cbdata   Opaque pointer handed back to cbfunc
+ * @return the OPAL translation of the PMIx_CID_nb status, or an OPAL
+ *         error detected beforehand; on error cbfunc is not called
+ */
+int pmix3x_cidnb(opal_list_t *procs, int start, int release,
+                 opal_pmix_cid_cbfunc_t cbfunc, void *cbdata)
+{
+    pmix_status_t rc;
+    pmix_proc_t *parray=NULL;
+    size_t n, cnt=0;
+    opal_namelist_t *ptr;
+    pmix3x_opcaddy_t *op;
+    char *nsptr;
+
+    opal_output_verbose(1, opal_pmix_base_framework.framework_output,
+                        "PMIx_client cidnb");
+
+    OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
+    if (0 >= opal_pmix_base.initialized) {
+        OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
+        return OPAL_ERR_NOT_INITIALIZED;
+    }
+
+    /* convert the list of procs to an array of pmix_proc_t */
+    if (NULL != procs && 0 < (cnt = opal_list_get_size(procs))) {
+        PMIX_PROC_CREATE(parray, cnt);
+        if (NULL == parray) {
+            OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
+            return OPAL_ERR_OUT_OF_RESOURCE;
+        }
+        n=0;
+        OPAL_LIST_FOREACH(ptr, procs, opal_namelist_t) {
+            if (NULL == (nsptr = pmix3x_convert_jobid(ptr->name.jobid))) {
+                PMIX_PROC_FREE(parray, cnt);
+                OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
+                return OPAL_ERR_NOT_FOUND;
+            }
+            (void)strncpy(parray[n].nspace, nsptr, PMIX_MAX_NSLEN);
+            parray[n].rank = pmix3x_convert_opalrank(ptr->name.vpid);
+            ++n;
+        }
+    }
+    OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
+
+    /* create the caddy */
+    op = OBJ_NEW(pmix3x_opcaddy_t);
+    op->cidcbfunc = cbfunc;
+    op->cbdata = cbdata;
+    op->procs = parray;
+    op->nprocs = cnt;
+
+    /* one info entry per directive that was actually requested */
+    op->ninfo = 0;
+    if (0 < start) {
+        op->ninfo++;
+    }
+    if (0 < release) {
+        op->ninfo++;
+    }
+    PMIX_INFO_CREATE(op->info, op->ninfo);
+    if (0 < start) {
+        PMIX_INFO_LOAD(&op->info[0], PMIX_START_CID, &start, PMIX_INT);
+    }
+    if (0 < release) {
+        /* last slot: index 0 when alone, index 1 when start is also set */
+        PMIX_INFO_LOAD(&op->info[op->ninfo-1], PMIX_RELEASE_CID, &release, PMIX_INT);
+    }
+
+    /* call the library function */
+    rc = PMIx_CID_nb(op->procs, op->nprocs, op->info, op->ninfo, cid_cbfunc, op);
+    if (PMIX_SUCCESS != rc) {
+        /* the request was never posted, so cid_cbfunc will not run
+         * and the caddy has to be released here */
+        OBJ_RELEASE(op);
+    }
+    return pmix3x_convert_rc(rc);
+}
+
diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h
index b76f0ddb3cd..503ae3f7f17 100644
--- a/opal/mca/pmix/pmix_types.h
+++ b/opal/mca/pmix/pmix_types.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
- * Copyright (c) 2016 Research Organization for Information Science
+ * Copyright (c) 2016-2018 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
  *
@@ -626,7 +626,8 @@ typedef void (*opal_pmix_tool_connection_cbfunc_t)(int status,
                                                    opal_process_name_t proc,
                                                    void *cbdata);
 
-
+typedef void (*opal_pmix_cid_cbfunc_t)(int status,
+                                       int cid, void *cbdata);
 END_C_DECLS
 
 #endif