Skip to content

Commit

Permalink
Get new sandia desktop machine 'climate' running scripts_regression_tests
Browse files Browse the repository at this point in the history

Also:
1) Change test_scheduler so that it does not hang forever if a test asks
for more processes than exist in the proc pool.
2) Change CMakeTester and MakeTester to not throw exceptions when
commands fail. Instead, raise a test-assertion failure with a nice
error message. This made it much easier to diagnose failures in
these tests on climate.
  • Loading branch information
jgfouca committed Sep 2, 2016
1 parent 4bcb7d8 commit 67a8165
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 19 deletions.
4 changes: 4 additions & 0 deletions cime_config/acme/allactive/config_pes.xml.cime2
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@
<NTASKS_ATM>64</NTASKS_ATM> <NTHRDS_ATM>1</NTHRDS_ATM> <ROOTPE_ATM>0</ROOTPE_ATM> <NINST_ATM>1</NINST_ATM>
</pes>

<pes GRID="a%ne30np4" MACH="sandia-srn-sems">
<NTASKS_ATM>64</NTASKS_ATM> <NTHRDS_ATM>1</NTHRDS_ATM> <ROOTPE_ATM>0</ROOTPE_ATM> <NINST_ATM>1</NINST_ATM>
</pes>

<pes GRID="a%ne30np4" MACH="edison">
<NTASKS_ATM>960</NTASKS_ATM> <NTHRDS_ATM>4</NTHRDS_ATM> <ROOTPE_ATM>0</ROOTPE_ATM> <NINST_ATM>1</NINST_ATM>
<PIO_NUMTASKS>-1</PIO_NUMTASKS><PIO_STRIDE>24</PIO_STRIDE> <PIO_TYPENAME>pnetcdf</PIO_TYPENAME> <PIO_ROOT>1</PIO_ROOT>
Expand Down
35 changes: 35 additions & 0 deletions cime_config/acme/allactive/config_pesall.xml
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,41 @@
</rootpe>
</pes>
</mach>
<mach name="sandia-srn-sems">
<pes compset="any" pesize="any">
<comment>none</comment>
<ntasks>
<ntasks_atm>64</ntasks_atm>
<ntasks_lnd>64</ntasks_lnd>
<ntasks_rof>64</ntasks_rof>
<ntasks_ice>64</ntasks_ice>
<ntasks_ocn>64</ntasks_ocn>
<ntasks_glc>64</ntasks_glc>
<ntasks_wav>64</ntasks_wav>
<ntasks_cpl>64</ntasks_cpl>
</ntasks>
<nthrds>
<nthrds_atm>1</nthrds_atm>
<nthrds_lnd>1</nthrds_lnd>
<nthrds_rof>1</nthrds_rof>
<nthrds_ice>1</nthrds_ice>
<nthrds_ocn>1</nthrds_ocn>
<nthrds_glc>1</nthrds_glc>
<nthrds_wav>1</nthrds_wav>
<nthrds_cpl>1</nthrds_cpl>
</nthrds>
<rootpe>
<rootpe_atm>0</rootpe_atm>
<rootpe_lnd>0</rootpe_lnd>
<rootpe_rof>0</rootpe_rof>
<rootpe_ice>0</rootpe_ice>
<rootpe_ocn>0</rootpe_ocn>
<rootpe_glc>0</rootpe_glc>
<rootpe_wav>0</rootpe_wav>
<rootpe_cpl>0</rootpe_cpl>
</rootpe>
</pes>
</mach>
</grid>
<grid name="a%ne30np4">
<mach name="edison">
Expand Down
2 changes: 1 addition & 1 deletion cime_config/acme/machines/config_compilers.xml
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ for mct, etc.
<CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
<NETCDF_PATH>$(NETCDFROOT)</NETCDF_PATH>
<PNETCDF_PATH>$(PNETCDFROOT)</PNETCDF_PATH>
<ADD_SLIBS> $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -lblas -llapack</ADD_SLIBS>
<ADD_SLIBS> $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -L/usr/lib64 -L/usr/lib64/atlas -lblas -llapack</ADD_SLIBS>
<CXX_LIBS>-lstdc++ -lmpi_cxx</CXX_LIBS>
<ALBANY_PATH>/projects/install/rhel6-x86_64/ACME/AlbanyTrilinos/Albany/build/install</ALBANY_PATH>
</compiler>
Expand Down
4 changes: 3 additions & 1 deletion cime_config/acme/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@

<machine MACH="sandia-srn-sems">
<DESC>Linux workstation at Sandia on SRN with SEMS TPL modules</DESC>
<NODENAME_REGEX>s999964</NODENAME_REGEX>
<NODENAME_REGEX>(s999964|climate)</NODENAME_REGEX>
<PROXY>wwwproxy.sandia.gov:80</PROXY>
<TESTS>acme_developer</TESTS>
<OS>LINUX</OS>
Expand All @@ -400,6 +400,7 @@
<DOUT_S_ROOT>$CESMSCRATCHROOT/archive/$CASE</DOUT_S_ROOT>
<DOUT_L_MSROOT>csm/$CASE</DOUT_L_MSROOT>
<CCSM_BASELINE>/sems-data-store/ACME/baselines</CCSM_BASELINE>
<SAVE_TIMING_DIR>/sems-data-store/ACME/timings</SAVE_TIMING_DIR>
<CCSM_CPRNC>/sems-data-store/ACME/cprnc/build/cprnc</CCSM_CPRNC>
<SUPPORTED_BY>jgfouca at sandia dot gov</SUPPORTED_BY>
<!-- <GMAKE>make</GMAKE> <- this doesn't actually work! -->
Expand All @@ -424,6 +425,7 @@
<cmd_path lang="sh">module</cmd_path>
<modules>
<command name="purge"/>
<command name="load">git/2.1.3</command>
<command name="load">python/2.7.9</command>
<command name="load">gcc/5.1.0/openmpi/1.8.7</command>
<command name="load">cmake/2.8.12</command>
Expand Down
33 changes: 24 additions & 9 deletions utils/python/CIME/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,6 @@ def _get_procs_needed(self, test, phase, threads_in_flight=None):
else:
return 1


###########################################################################
def _wait_for_something_to_finish(self, threads_in_flight):
###########################################################################
Expand All @@ -613,6 +612,17 @@ def _wait_for_something_to_finish(self, threads_in_flight):
self._procs_avail += procs_needed
del threads_in_flight[finished_test]

###########################################################################
def _update_test_status_file(self, test, test_phase, status):
###########################################################################
"""
Record `status` for `test_phase` in the TestStatus file of `test`.

In general, test_scheduler should not be responsible for updating
the TestStatus file, but there are a few cases where it has to.

test: the test whose status is updated; used to look up the test
directory via _get_test_dir and passed to TestStatus as test_name
test_phase: the phase whose status is being set
status: the status value handed to TestStatus.set_status
"""
# NOTE(review): presumably TestStatus persists the change when the
# with-block exits - confirm against the TestStatus implementation
test_dir = self._get_test_dir(test)
with TestStatus(test_dir=test_dir, test_name=test) as ts:
ts.set_status(test_phase, status)

###########################################################################
def _consumer(self, test, test_phase, phase_method):
###########################################################################
Expand All @@ -634,14 +644,9 @@ def _consumer(self, test, test_phase, phase_method):
if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE, NAMELIST_PHASE]:
# These are the phases for which TestScheduler is responsible for
# updating the TestStatus file
test_dir = self._get_test_dir(test)

with TestStatus(test_dir=test_dir, test_name=test) as ts:
nl_problem = self._get_test_data(test)[2]
if test_phase == NAMELIST_PHASE and nl_problem:
ts.set_status(test_phase, TEST_FAIL_STATUS)
else:
ts.set_status(test_phase, status)
nl_problem = self._get_test_data(test)[2]
status = TEST_FAIL_STATUS if nl_problem and test_phase == NAMELIST_PHASE else status
self._update_test_status_file(test, test_phase, status)

# On batch systems, we want to immediately submit to the queue, because
# it's very cheap to submit and will get us a better spot in line
Expand Down Expand Up @@ -684,6 +689,16 @@ def _producer(self):
threads_in_flight[test] = (new_thread, procs_needed, next_phase)
new_thread.start()
num_threads_launched_this_iteration += 1
else:
if not threads_in_flight:
msg = "Phase '%s' for test '%s' required more processors, %d, than this machine can provide, %d" % \
(next_phase, test, procs_needed, self._procs_avail)
logger.warning(msg)
self._update_test_status(test, next_phase, TEST_PENDING_STATUS)
self._update_test_status(test, next_phase, TEST_FAIL_STATUS)
self._log_output(test, msg)
self._update_test_status_file(test, next_phase, TEST_FAIL_STATUS)
num_threads_launched_this_iteration += 1

if not work_to_do:
break
Expand Down
5 changes: 3 additions & 2 deletions utils/python/CIME/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def get_model():

_hack=object()
def run_cmd(cmd, input_str=None, from_dir=None, verbose=None,
arg_stdout=_hack, arg_stderr=_hack):
arg_stdout=_hack, arg_stderr=_hack, env=None):
"""
Wrapper around subprocess to make it much more convenient to run shell commands
Expand Down Expand Up @@ -170,7 +170,8 @@ def run_cmd(cmd, input_str=None, from_dir=None, verbose=None,
stdout=arg_stdout,
stderr=arg_stderr,
stdin=stdin,
cwd=from_dir)
cwd=from_dir,
env=env)

output, errput = proc.communicate(input_str)
output = output.strip() if output is not None else output
Expand Down
10 changes: 4 additions & 6 deletions utils/python/tests/scripts_regression_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@

# pragma pylint: disable=protected-access
###############################################################################
def run_cmd_assert_result(test_obj, cmd, from_dir=None, expected_stat=0):
def run_cmd_assert_result(test_obj, cmd, from_dir=None, expected_stat=0, env=None):
###############################################################################
from_dir = os.getcwd() if from_dir is None else from_dir
stat, output, errput = run_cmd(cmd, from_dir=from_dir)
stat, output, errput = run_cmd(cmd, from_dir=from_dir, env=env)
if expected_stat == 0:
expectation = "SHOULD HAVE WORKED, INSTEAD GOT STAT %s" % stat
else:
Expand Down Expand Up @@ -1219,8 +1219,7 @@ def query_var(self, var_name, env, var):
environment = os.environ.copy()
environment.update(env)
environment.update(var)
subprocess.check_output(["gmake", "query", "--directory="+temp_dir],
stderr=subprocess.STDOUT, env=environment)
run_cmd_assert_result(self.parent, "gmake query --directory=%s 2>&1" % temp_dir, env=environment)

with open(output_name, "r") as output:
query_result = output.read().strip()
Expand Down Expand Up @@ -1308,8 +1307,7 @@ def query_var(self, var_name, env, var):

environment = os.environ.copy()
environment.update(env)
subprocess.check_output(["cmake", "."], cwd=temp_dir,
stderr=subprocess.STDOUT, env=environment)
run_cmd_assert_result(self.parent, "cmake . 2>&1", from_dir=temp_dir, env=environment)

with open(output_name, "r") as output:
query_result = output.read().strip()
Expand Down

0 comments on commit 67a8165

Please sign in to comment.