Skip to content

Commit

Permalink
Restore optimized num nodes.
Browse files Browse the repository at this point in the history
On Titan, the aprun command is now correct, but num_nodes was still
incorrect because it did not use the 'optimized' node count that task
maker was programmed to compute for aprun long ago. This commit
restores that capability.

[BFB]
  • Loading branch information
jgfouca committed Mar 28, 2017
1 parent 48fc6b3 commit c391d73
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 9 deletions.
15 changes: 9 additions & 6 deletions cime/utils/python/CIME/aprun.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,14 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

c2 += 1

logger.info("total tasks is: %s" % total_tasks)

# make sure all maxt values at least 1
for c1 in xrange(0, total_tasks):
if maxt[c1] < 1:
maxt[c1] = 1

# Compute task and thread settings for batch commands
tasks_per_node, task_count, thread_count, max_thread_count, aprun = \
0, 1, maxt[0], maxt[0], "aprun"
tasks_per_node, task_count, thread_count, max_thread_count, total_node_count, aprun = \
0, 1, maxt[0], maxt[0], 0, "aprun"
for c1 in xrange(1, total_tasks):
if maxt[c1] != thread_count:
tasks_per_node = min(pes_per_node, max_tasks_per_node / thread_count)
Expand All @@ -83,6 +81,9 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

aprun += " -n %d -N %d -d %d %s :" % (task_count, tasks_per_node, thread_count, run_exe)

node_count = int(math.ceil(float(task_count) / tasks_per_node))
total_node_count += node_count

thread_count = maxt[c1]
max_thread_count = max(max_thread_count, maxt[c1])
task_count = 1
Expand All @@ -99,6 +100,8 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

task_per_numa = int(math.ceil(tasks_per_node / 2.0))

total_node_count += int(math.ceil(float(task_count) / tasks_per_node))

# Special option for Titan with intel compiler
if machine == "titan" and tasks_per_node > 1:
aprun += " -S %d" % task_per_numa
Expand All @@ -107,13 +110,13 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

aprun += " -n %d -N %d -d %d %s " % (task_count, tasks_per_node, thread_count, run_exe)

return aprun
return aprun, total_node_count

###############################################################################
def get_aprun_cmd_for_case(case, run_exe):
###############################################################################
"""
Given a case, construct and return the aprun command
Given a case, construct and return the aprun command and optimized node count
"""
models = case.get_values("COMP_CLASSES")
ntasks, nthreads, rootpes, pstrids = [], [], [], []
Expand Down
9 changes: 7 additions & 2 deletions cime/utils/python/CIME/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,19 @@ def _initialize_derived_attributes(self):
self.thread_count = env_mach_pes.get_max_thread_count(comp_classes)
self.tasks_per_node = env_mach_pes.get_tasks_per_node(self.total_tasks, self.thread_count)
logger.debug("total_tasks %s thread_count %s"%(self.total_tasks, self.thread_count))
self.num_nodes = env_mach_pes.get_total_nodes(self.total_tasks, self.thread_count)

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(1,int(self.get_value("MAX_TASKS_PER_NODE") / pes_per_node))

threads_per_node = self.tasks_per_node * self.thread_count
threads_per_core = 1 if (threads_per_node <= pes_per_node) else smt_factor
self.cores_per_task = self.thread_count / threads_per_core

if self.get_value("MACH") == "titan":
self.num_nodes = get_aprun_cmd_for_case(self, "acme.exe")[1]
else:
self.num_nodes = env_mach_pes.get_total_nodes(self.total_tasks, self.thread_count)

# Define __enter__ and __exit__ so that we can use this as a context manager
# and force a flush on exit.
def __enter__(self):
Expand Down Expand Up @@ -1094,7 +1099,7 @@ def get_mpirun_cmd(self, job="case.run"):

# special case for aprun
if executable == "aprun":
return get_aprun_cmd_for_case(self, run_exe) + " " + run_misc_suffix
return get_aprun_cmd_for_case(self, run_exe)[0] + " " + run_misc_suffix
else:
mpi_arg_string = " ".join(args.values())

Expand Down
2 changes: 1 addition & 1 deletion cime/utils/python/CIME/case_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def _case_setup_impl(case, caseroot, clean=False, test_mode=False, reset=False):
# create batch files
logger.info("Creating batch script case.run")
env_batch = case.get_env("batch")
num_nodes = env_mach_pes.get_total_nodes(pestot, thread_count)
num_nodes = case.num_nodes
tasks_per_node = env_mach_pes.get_tasks_per_node(pestot, thread_count)
for job in env_batch.get_jobs():
input_batch_script = os.path.join(case.get_value("MACHDIR"), env_batch.get_value('template', subgroup=job))
Expand Down

0 comments on commit c391d73

Please sign in to comment.