Merge pull request #1464 from ESMCI/jgfouca/remove_total_cores
Remove TOTAL_CORES as it was inaccurate in many cases.
TOTAL_CORES was only being used by test_scheduler in no-batch mode
to determine how many processors in the proc pool should be
considered consumed by a specific test.

Users on other systems, notably titan, were seeing this value
and wondering why it was so inaccurate.

test_scheduler now computes the core count on the fly when needed.

COST_PES now takes spare nodes into account.
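
For orientation, a minimal sketch (made-up numbers and illustrative helper names, not CIME API) contrasting the value TOTAL_CORES used to store with the estimate test_scheduler now derives on demand from TOTALPES and the per-component NTHRDS settings:

    def old_total_cores(total_tasks, cores_per_task):
        # What the removed TOTAL_CORES xml variable used to hold.
        return total_tasks * cores_per_task

    def procs_needed_on_the_fly(total_pes, nthrds_by_comp):
        # What test_scheduler now computes when it needs a core count:
        # TOTALPES scaled by the largest per-component thread count.
        return total_pes * max(nthrds_by_comp.values())

    if __name__ == "__main__":
        print(old_total_cores(total_tasks=128, cores_per_task=2))                      # 256
        print(procs_needed_on_the_fly(128, {"CPL": 1, "ATM": 2, "LND": 1, "OCN": 1}))  # 256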

Test suite: scripts_regression_tests
Test baseline:
Test namelist changes:
Test status: bit for bit

Fixes [CIME Github issue #]

User interface changes?: Removes TOTAL_CORES

Code review: @jedwards4b
jedwards4b authored May 2, 2017
2 parents 6b15377 + 64b4a99 commit 965f63e
Showing 6 changed files with 15 additions and 22 deletions.
scripts/lib/CIME/XML/env_mach_pes.py (2 additions, 1 deletion)

@@ -45,7 +45,8 @@ def get_cost_pes(self, totaltasks, max_thread_count, machine=None):
         figure out the value of COST_PES which is the pe value used to estimate model cost
         """
         pespn = self.get_value("PES_PER_NODE")
-        num_nodes = self.get_total_nodes(totaltasks, max_thread_count)[0]
+        num_nodes, spare_nodes = self.get_total_nodes(totaltasks, max_thread_count)
+        num_nodes += spare_nodes
         # This is hardcoded because on yellowstone by default we
         # run with 15 pes per node
         # but pay for 16 pes per node. See github issue #518
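
A short sketch of what the updated cost estimate charges for, assuming (as the hunk above shows) that get_total_nodes returns a (num_nodes, spare_nodes) pair; the stand-in node calculation below is illustrative, not the real EnvMachPes logic:

    import math

    def example_get_total_nodes(totaltasks, max_thread_count, pes_per_node=16, spare=1):
        # Toy stand-in for get_total_nodes: round the task/thread footprint up to
        # whole nodes and report one spare node.
        return math.ceil(totaltasks * max_thread_count / pes_per_node), spare

    def cost_pes(totaltasks, max_thread_count, pes_per_node=16):
        num_nodes, spare_nodes = example_get_total_nodes(totaltasks, max_thread_count, pes_per_node)
        num_nodes += spare_nodes  # spare nodes are now included in the cost
        return num_nodes * pes_per_node

    # 130 tasks, 1 thread each: ceil(130/16) = 9 nodes, +1 spare = 10 nodes -> 160 pes
    print(cost_pes(130, 1))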
scripts/lib/CIME/case.py (3 deletions)

@@ -824,9 +824,6 @@ def configure(self, compset_name, grid_name, machine_name=None,
             if pio_typename in ("pnetcdf", "netcdf4p"):
                 self.set_value(key, "netcdf")

-        # Set TOTAL_CORES
-        self.set_value("TOTAL_CORES", self.total_tasks * self.cores_per_task )
-
         if input_dir is not None:
             self.set_value("DIN_LOC_ROOT", os.path.abspath(input_dir))
scripts/lib/CIME/case_setup.py (6 deletions)

@@ -140,9 +140,6 @@ def _case_setup_impl(case, caseroot, clean=False, test_mode=False, reset=False):
        logger.info("Machine/Decomp/Pes configuration has already been done ...skipping")

        case.initialize_derived_attributes()
-
-        # Set TOTAL_CORES
-        case.set_value("TOTAL_CORES", case.total_tasks * case.cores_per_task )
    else:
        check_pelayouts_require_rebuild(case, models)

@@ -176,9 +173,6 @@ def _case_setup_impl(case, caseroot, clean=False, test_mode=False, reset=False):
            logger.info("Writing %s script from input template %s" % (job, input_batch_script))
            env_batch.make_batch_script(input_batch_script, job, case, pestot, tasks_per_node, num_nodes, thread_count)

-        # Set TOTAL_CORES
-        case.set_value("TOTAL_CORES", case.total_tasks * case.cores_per_task )
-
        # Make a copy of env_mach_pes.xml in order to be able
        # to check that it does not change once case.setup is invoked
        logger.info("Locking file env_mach_pes.xml")
scripts/lib/CIME/test_scheduler.py (12 additions, 2 deletions)

@@ -593,8 +593,18 @@ def _get_procs_needed(self, test, phase, threads_in_flight=None, no_batch=False):
    ###########################################################################
        if phase == RUN_PHASE and (self._no_batch or no_batch):
            test_dir = self._get_test_dir(test)
-            out = run_cmd_no_fail("./xmlquery TOTAL_CORES -value", from_dir=test_dir)
-            return int(out)
+            total_pes = int(run_cmd_no_fail("./xmlquery TOTALPES --value", from_dir=test_dir))
+            threads = eval(run_cmd_no_fail("./xmlquery NTHRDS --value", from_dir=test_dir))
+            max_threads = 0
+            for item in threads:
+                _, comp_threads = item.split(":")
+                comp_threads = int(comp_threads)
+                if comp_threads > max_threads:
+                    max_threads = comp_threads
+
+            max_cores = total_pes * max_threads
+            return max_cores

        elif (phase == SHAREDLIB_BUILD_PHASE):
            # Will force serialization of sharedlib builds
            # TODO - instead of serializing, compute all library configs needed and build
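
The new run-phase branch assumes that ./xmlquery NTHRDS --value prints something that evaluates to an iterable of "COMPONENT:threads" strings; the sample string below is an assumption used to illustrate the parsing, with ast.literal_eval shown as a safer stand-in for the eval call in the diff:

    import ast

    # Assumed shape of the xmlquery output for the grouped NTHRDS variable.
    sample_output = "['CPL:1', 'ATM:2', 'LND:1', 'OCN:1']"

    threads = ast.literal_eval(sample_output)   # the diff itself uses eval() on the real output
    max_threads = max(int(item.split(":")[1]) for item in threads)

    total_pes = 128                             # illustrative TOTALPES value
    print(total_pes * max_threads)              # 256 cores charged against the proc pool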
scripts/tests/scripts_regression_tests.py (1 addition, 1 deletion)

@@ -1385,7 +1385,7 @@ def test_cime_case_force_pecount(self):
            self.assertEqual(case.get_value("NTHRDS_CPL"), 8)

            expected_cores = 16 * case.cores_per_task
-            self.assertEqual(case.get_value("TOTAL_CORES"), expected_cores)
+            self.assertEqual(case.get_value("COST_PES"), expected_cores)

    ###########################################################################
    def test_cime_case_xmlchange_append(self):
src/drivers/mct/cime_config/config_component.xml (9 deletions)

@@ -2037,7 +2037,6 @@
    <desc>Number of instances for each component</desc>
  </entry>

-
  <entry id="TOTALPES">
    <type>integer</type>
    <default_value>0</default_value>

@@ -2046,14 +2045,6 @@
    <desc>total number of tasks and threads (setup automatically - DO NOT EDIT)</desc>
  </entry>

-  <entry id="TOTAL_CORES">
-    <type>integer</type>
-    <default_value>1</default_value>
-    <group>mach_pes_last</group>
-    <file>env_mach_pes.xml</file>
-    <desc>total number of cores used (setup automatically - DO NOT EDIT)</desc>
-  </entry>
-
  <entry id="MAX_TASKS_PER_NODE">
    <type>integer</type>
    <default_value>0</default_value>
