Skip to content

Commit

Permalink
Merge branch 'jgfouca/cime/core_file_support' into master (PR #1353)
Browse files Browse the repository at this point in the history
Add better support for core files

Use Python's `resource` module (the counterpart of `ulimit` in bash) to
try to remove any size limits on core files.

Fixes #1308

[BFB]

* jgfouca/cime/core_file_support:
  Add better support for core files
  • Loading branch information
jgfouca committed Mar 29, 2017
1 parent 4865b73 commit eca0f4b
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 9 deletions.
15 changes: 9 additions & 6 deletions utils/python/CIME/aprun.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,14 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

c2 += 1

logger.info("total tasks is: %s" % total_tasks)

# make sure all maxt values at least 1
for c1 in xrange(0, total_tasks):
if maxt[c1] < 1:
maxt[c1] = 1

# Compute task and thread settings for batch commands
tasks_per_node, task_count, thread_count, max_thread_count, aprun = \
0, 1, maxt[0], maxt[0], "aprun"
tasks_per_node, task_count, thread_count, max_thread_count, total_node_count, aprun = \
0, 1, maxt[0], maxt[0], 0, "aprun"
for c1 in xrange(1, total_tasks):
if maxt[c1] != thread_count:
tasks_per_node = min(pes_per_node, max_tasks_per_node / thread_count)
Expand All @@ -83,6 +81,9 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

aprun += " -n %d -N %d -d %d %s :" % (task_count, tasks_per_node, thread_count, run_exe)

node_count = int(math.ceil(float(task_count) / tasks_per_node))
total_node_count += node_count

thread_count = maxt[c1]
max_thread_count = max(max_thread_count, maxt[c1])
task_count = 1
Expand All @@ -99,6 +100,8 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

task_per_numa = int(math.ceil(tasks_per_node / 2.0))

total_node_count += int(math.ceil(float(task_count) / tasks_per_node))

# Special option for Titan with intel compiler
if machine == "titan" and tasks_per_node > 1:
aprun += " -S %d" % task_per_numa
Expand All @@ -107,13 +110,13 @@ def _get_aprun_cmd_for_case_impl(ntasks, nthreads, rootpes, pstrids,

aprun += " -n %d -N %d -d %d %s " % (task_count, tasks_per_node, thread_count, run_exe)

return aprun
return aprun, total_node_count

###############################################################################
def get_aprun_cmd_for_case(case, run_exe):
###############################################################################
"""
Given a case, construct and return the aprun command
Given a case, construct and return the aprun command and optimized node count
"""
models = case.get_values("COMP_CLASSES")
ntasks, nthreads, rootpes, pstrids = [], [], [], []
Expand Down
9 changes: 7 additions & 2 deletions utils/python/CIME/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,19 @@ def _initialize_derived_attributes(self):
self.thread_count = env_mach_pes.get_max_thread_count(comp_classes)
self.tasks_per_node = env_mach_pes.get_tasks_per_node(self.total_tasks, self.thread_count)
logger.debug("total_tasks %s thread_count %s"%(self.total_tasks, self.thread_count))
self.num_nodes = env_mach_pes.get_total_nodes(self.total_tasks, self.thread_count)

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(1,int(self.get_value("MAX_TASKS_PER_NODE") / pes_per_node))

threads_per_node = self.tasks_per_node * self.thread_count
threads_per_core = 1 if (threads_per_node <= pes_per_node) else smt_factor
self.cores_per_task = self.thread_count / threads_per_core

if self.get_value("MACH") == "titan":
self.num_nodes = get_aprun_cmd_for_case(self, "acme.exe")[1]
else:
self.num_nodes = env_mach_pes.get_total_nodes(self.total_tasks, self.thread_count)

# Define __enter__ and __exit__ so that we can use this as a context manager
# and force a flush on exit.
def __enter__(self):
Expand Down Expand Up @@ -1094,7 +1099,7 @@ def get_mpirun_cmd(self, job="case.run"):

# special case for aprun
if executable == "aprun":
return get_aprun_cmd_for_case(self, run_exe) + " " + run_misc_suffix
return get_aprun_cmd_for_case(self, run_exe)[0] + " " + run_misc_suffix
else:
mpi_arg_string = " ".join(args.values())

Expand Down
2 changes: 1 addition & 1 deletion utils/python/CIME/case_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def _case_setup_impl(case, caseroot, clean=False, test_mode=False, reset=False):
# create batch files
logger.info("Creating batch script case.run")
env_batch = case.get_env("batch")
num_nodes = env_mach_pes.get_total_nodes(pestot, thread_count)
num_nodes = case.num_nodes
tasks_per_node = env_mach_pes.get_tasks_per_node(pestot, thread_count)
for job in env_batch.get_jobs():
input_batch_script = os.path.join(case.get_value("MACHDIR"), env_batch.get_value('template', subgroup=job))
Expand Down

0 comments on commit eca0f4b

Please sign in to comment.