Skip to content

Commit a95de6e

Browse files
author
rhc54
committed
Merge pull request #1353 from rhc54/topic/host
Per the discussion on the telecon, change the -host behavior yet again
2 parents 74293bc + 503e127 commit a95de6e

File tree

9 files changed

+108
-89
lines changed

9 files changed

+108
-89
lines changed

orte/mca/plm/base/base.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
13-
* Copyright (c) 2015 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
1414
* $COPYRIGHT$
1515
*
1616
* Additional copyrights may follow
@@ -53,6 +53,7 @@ ORTE_DECLSPEC int orte_plm_base_select(void);
5353
ORTE_DECLSPEC void orte_plm_base_app_report_launch(int fd, short event, void *data);
5454
ORTE_DECLSPEC void orte_plm_base_receive_process_msg(int fd, short event, void *data);
5555

56+
ORTE_DECLSPEC void orte_plm_base_set_slots(orte_node_t *node);
5657
ORTE_DECLSPEC void orte_plm_base_setup_job(int fd, short args, void *cbdata);
5758
ORTE_DECLSPEC void orte_plm_base_setup_job_complete(int fd, short args, void *cbdata);
5859
ORTE_DECLSPEC void orte_plm_base_complete_setup(int fd, short args, void *cbdata);

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 31 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,36 @@
7979
#include "orte/mca/plm/base/plm_private.h"
8080
#include "orte/mca/plm/base/base.h"
8181

82+
/*
 * Assign node->slots from the global orte_set_slots directive.
 *
 * The directive is compared, in order, against the keywords "cores",
 * "sockets", "numas" and "hwthreads". Only as many characters as the
 * directive itself contains are compared, so a leading abbreviation of
 * a keyword selects that keyword. A directive that matches none of the
 * keywords is handed to strtol() as a base-10 number, and the result of
 * the conversion is stored without any further checking.
 *
 * @param node  node whose topology is queried and whose slots field
 *              is overwritten
 */
void orte_plm_base_set_slots(orte_node_t *node)
{
    /* the directive does not change below, so measure it only once */
    const size_t len = strlen(orte_set_slots);

    if (0 == strncmp(orte_set_slots, "cores", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_CORE, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    if (0 == strncmp(orte_set_slots, "sockets", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_SOCKET, 0,
                                                         OPAL_HWLOC_LOGICAL);
        if (0 == node->slots) {
            /* not every system reports sockets - when none are
             * found, fall back to the numanode count instead */
            node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                             HWLOC_OBJ_NODE, 0,
                                                             OPAL_HWLOC_LOGICAL);
        }
        return;
    }

    if (0 == strncmp(orte_set_slots, "numas", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_NODE, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    if (0 == strncmp(orte_set_slots, "hwthreads", len)) {
        node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                         HWLOC_OBJ_PU, 0,
                                                         OPAL_HWLOC_LOGICAL);
        return;
    }

    /* no keyword matched, so the directive is treated as a number */
    node->slots = strtol(orte_set_slots, NULL, 10);
}
111+
82112
void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
83113
{
84114
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@@ -148,33 +178,7 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
148178
OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
149179
"%s plm:base:setting slots for node %s by %s",
150180
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, orte_set_slots));
151-
if (0 == strncmp(orte_set_slots, "cores", strlen(orte_set_slots))) {
152-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
153-
HWLOC_OBJ_CORE, 0,
154-
OPAL_HWLOC_LOGICAL);
155-
} else if (0 == strncmp(orte_set_slots, "sockets", strlen(orte_set_slots))) {
156-
if (0 == (node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
157-
HWLOC_OBJ_SOCKET, 0,
158-
OPAL_HWLOC_LOGICAL))) {
159-
/* some systems don't report sockets - in this case,
160-
* use numanodes
161-
*/
162-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
163-
HWLOC_OBJ_NODE, 0,
164-
OPAL_HWLOC_LOGICAL);
165-
}
166-
} else if (0 == strncmp(orte_set_slots, "numas", strlen(orte_set_slots))) {
167-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
168-
HWLOC_OBJ_NODE, 0,
169-
OPAL_HWLOC_LOGICAL);
170-
} else if (0 == strncmp(orte_set_slots, "hwthreads", strlen(orte_set_slots))) {
171-
node->slots = opal_hwloc_base_get_nbobjs_by_type(node->topology,
172-
HWLOC_OBJ_PU, 0,
173-
OPAL_HWLOC_LOGICAL);
174-
} else {
175-
/* must be a number */
176-
node->slots = strtol(orte_set_slots, NULL, 10);
177-
}
181+
orte_plm_base_set_slots(node);
178182
}
179183
}
180184
}

orte/mca/ras/base/ras_base_allocate.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
1313
* reserved.
14-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* $COPYRIGHT$
1616
*
1717
* Additional copyrights may follow
@@ -77,8 +77,8 @@ void orte_ras_base_display_alloc(void)
7777
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
7878
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse);
7979
} else {
80-
asprintf(&tmp2, "\t%s: slots=%d max_slots=%d slots_inuse=%d state=%s\n",
81-
(NULL == alloc->name) ? "UNKNOWN" : alloc->name,
80+
asprintf(&tmp2, "\t%s: flags=0x%02x slots=%d max_slots=%d slots_inuse=%d state=%s\n",
81+
(NULL == alloc->name) ? "UNKNOWN" : alloc->name, alloc->flags,
8282
(int)alloc->slots, (int)alloc->slots_max, (int)alloc->slots_inuse,
8383
orte_node_state_to_str(alloc->state));
8484
}

orte/mca/ras/base/ras_base_node.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
1313
* reserved.
14-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
14+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* $COPYRIGHT$

orte/mca/rmaps/base/help-orte-rmaps-base.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,3 +404,9 @@ or provide more node locations in the file.
404404
The request to map processes by distance could not be completed
405405
because the device to map near was not specified. Please use the
406406
rmaps_dist_device MCA parameter to set it.
407+
#
408+
[num-procs-not-specified]
409+
Either the -host or -hostfile option was given, but the number
410+
of processes to start was omitted. This combination is not supported.
411+
412+
Please specify the number of processes to run and try again.

orte/mca/rmaps/base/rmaps_base_map_job.c

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
1414
* All rights reserved.
15-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1616
* Copyright (c) 2016 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* $COPYRIGHT$
@@ -50,8 +50,9 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
5050
{
5151
orte_job_t *jdata;
5252
orte_job_map_t *map;
53+
orte_node_t *node;
5354
int rc, i;
54-
bool did_map;
55+
bool did_map, given;
5556
orte_rmaps_base_selected_module_t *mod;
5657
orte_job_t *parent;
5758
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
@@ -71,6 +72,47 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
7172
"mca:rmaps: mapping job %s",
7273
ORTE_JOBID_PRINT(jdata->jobid));
7374

75+
/* compute the number of procs and check validity */
76+
nprocs = 0;
77+
for (i=0; i < jdata->apps->size; i++) {
78+
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
79+
if (0 == app->num_procs) {
80+
opal_list_t nodes;
81+
orte_std_cntr_t slots;
82+
OBJ_CONSTRUCT(&nodes, opal_list_t);
83+
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
84+
/* if we are in a managed allocation, then all is good - otherwise,
85+
* we have to do a little more checking */
86+
if (!orte_managed_allocation) {
87+
/* if all the nodes have their slots given, then we are okay */
88+
given = true;
89+
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
90+
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
91+
given = false;
92+
break;
93+
}
94+
}
95+
/* if -host or -hostfile was given, and the slots were not,
96+
* then this is no longer allowed */
97+
if (!given &&
98+
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
99+
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
100+
/* inform the user of the error */
101+
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
102+
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
103+
OBJ_RELEASE(caddy);
104+
OPAL_LIST_DESTRUCT(&nodes);
105+
return;
106+
}
107+
}
108+
OPAL_LIST_DESTRUCT(&nodes);
109+
nprocs += slots;
110+
} else {
111+
nprocs += app->num_procs;
112+
}
113+
}
114+
}
115+
74116
/* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
75117
* THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
76118
* PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
@@ -91,22 +133,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
91133
OBJ_RELEASE(caddy);
92134
return;
93135
}
94-
/* compute the number of procs */
95-
nprocs = 0;
96-
for (i=0; i < jdata->apps->size; i++) {
97-
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
98-
if (0 == app->num_procs) {
99-
opal_list_t nodes;
100-
orte_std_cntr_t slots;
101-
OBJ_CONSTRUCT(&nodes, opal_list_t);
102-
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
103-
OPAL_LIST_DESTRUCT(&nodes);
104-
nprocs += slots;
105-
} else {
106-
nprocs += app->num_procs;
107-
}
108-
}
109-
}
110136
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
111137
"mca:rmaps: nprocs %s",
112138
ORTE_VPID_PRINT(nprocs));
@@ -142,12 +168,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
142168
}
143169
/* check for oversubscribe directives */
144170
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
145-
if (orte_managed_allocation) {
146-
/* by default, we do not allow oversubscription in managed environments */
147-
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
148-
} else {
149-
ORTE_UNSET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
150-
}
171+
ORTE_SET_MAPPING_DIRECTIVE(map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
151172
} else {
152173
/* pass along the directive */
153174
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
@@ -179,13 +200,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
179200
if (!jdata->map->display_map) {
180201
jdata->map->display_map = orte_rmaps_base.display_map;
181202
}
182-
/* compute the number of procs */
183-
nprocs = 0;
184-
for (i=0; i < jdata->apps->size; i++) {
185-
if (NULL != (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
186-
nprocs += app->num_procs;
187-
}
188-
}
189203
/* set the default mapping policy IFF it wasn't provided */
190204
if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
191205
/* default based on number of procs */
@@ -215,12 +229,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
215229
}
216230
/* check for oversubscribe directives */
217231
if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
218-
if (orte_managed_allocation) {
219-
/* by default, we do not allow oversubscription in managed environments */
220-
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
221-
} else {
222-
ORTE_UNSET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
223-
}
232+
ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
224233
} else {
225234
/* pass along the directive */
226235
if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {

orte/runtime/data_type_support/orte_dt_print_fns.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,8 +345,8 @@ int orte_dt_print_node(char **output, char *prefix, orte_node_t *src, opal_data_
345345
goto PRINT_PROCS;
346346
}
347347

348-
asprintf(&tmp, "\n%sData for node: %s\tState: %0x",
349-
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state);
348+
asprintf(&tmp, "\n%sData for node: %s\tState: %0x\tFlags: %02x",
349+
pfx2, (NULL == src->name) ? "UNKNOWN" : src->name, src->state, src->flags);
350350
/* does this node have any aliases? */
351351
tmp3 = NULL;
352352
if (orte_get_attribute(&src->attributes, ORTE_NODE_ALIAS, (void**)&tmp3, OPAL_STRING)) {

orte/util/attr.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,22 +29,22 @@ typedef uint8_t orte_app_context_flags_t;
2929

3030

3131
/* APP_CONTEXT ATTRIBUTE KEYS */
32-
#define ORTE_APP_HOSTFILE 1 // string - hostfile
33-
#define ORTE_APP_ADD_HOSTFILE 2 // string - hostfile to be added
34-
#define ORTE_APP_DASH_HOST 3 // string - hosts specified with -host option
35-
#define ORTE_APP_ADD_HOST 4 // string - hosts to be added
36-
#define ORTE_APP_USER_CWD 5 // bool - user specified cwd
37-
#define ORTE_APP_SSNDIR_CWD 6 // bool - use session dir as cwd
38-
#define ORTE_APP_PRELOAD_BIN 7 // bool - move binaries to remote nodes prior to exec
39-
#define ORTE_APP_PRELOAD_FILES 8 // string - files to be moved to remote nodes prior to exec
40-
#define ORTE_APP_SSTORE_LOAD 9 // string
41-
#define ORTE_APP_RECOV_DEF 10 // bool - whether or not a recovery policy was defined
42-
#define ORTE_APP_MAX_RESTARTS 11 // int32 - max number of times a process can be restarted
43-
#define ORTE_APP_MIN_NODES 12 // int64 - min number of nodes required
44-
#define ORTE_APP_MANDATORY 13 // bool - flag if nodes requested in -host are "mandatory" vs "optional"
45-
#define ORTE_APP_MAX_PPN 14 // uint32 - maximum number of procs/node for this app
46-
#define ORTE_APP_PREFIX_DIR 15 // string - prefix directory for this app, if override necessary
47-
#define ORTE_APP_NO_CACHEDIR 16 // bool - flag that a cache dir is not to be specified for a Singularity container
32+
#define ORTE_APP_HOSTFILE 1 // string - hostfile
33+
#define ORTE_APP_ADD_HOSTFILE 2 // string - hostfile to be added
34+
#define ORTE_APP_DASH_HOST 3 // string - hosts specified with -host option
35+
#define ORTE_APP_ADD_HOST 4 // string - hosts to be added
36+
#define ORTE_APP_USER_CWD 5 // bool - user specified cwd
37+
#define ORTE_APP_SSNDIR_CWD 6 // bool - use session dir as cwd
38+
#define ORTE_APP_PRELOAD_BIN 7 // bool - move binaries to remote nodes prior to exec
39+
#define ORTE_APP_PRELOAD_FILES 8 // string - files to be moved to remote nodes prior to exec
40+
#define ORTE_APP_SSTORE_LOAD 9 // string
41+
#define ORTE_APP_RECOV_DEF 10 // bool - whether or not a recovery policy was defined
42+
#define ORTE_APP_MAX_RESTARTS 11 // int32 - max number of times a process can be restarted
43+
#define ORTE_APP_MIN_NODES 12 // int64 - min number of nodes required
44+
#define ORTE_APP_MANDATORY 13 // bool - flag if nodes requested in -host are "mandatory" vs "optional"
45+
#define ORTE_APP_MAX_PPN 14 // uint32 - maximum number of procs/node for this app
46+
#define ORTE_APP_PREFIX_DIR 15 // string - prefix directory for this app, if override necessary
47+
#define ORTE_APP_NO_CACHEDIR 16 // bool - flag that a cache dir is not to be specified for a Singularity container
4848

4949
#define ORTE_APP_MAX_KEY 100
5050

orte/util/dash_host/dash_host.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2015 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* $COPYRIGHT$
@@ -249,7 +249,6 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
249249
}
250250
} else {
251251
node->slots = 1;
252-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
253252
}
254253
opal_list_append(&adds, &node->super);
255254
}

0 commit comments

Comments
 (0)