Get new Sandia desktop machine 'climate' running scripts_regression_tests

Also: 1) Change test_scheduler to not hang forever if a test asks for more processes than exist in the proc pool. 2) Change CMakeTester and MakeTester to not throw exceptions when commands fail. Instead, raise a test-assertion failure with a nice error message. This made it much easier to diagnose failures in these tests on climate.
E3SM-Project · Sep 2, 2016 · 67a8165 · 67a8165
1 parent 4bcb7d8
commit 67a8165
Show file tree

Hide file tree

Showing 7 changed files with 74 additions and 19 deletions.
diff --git a/cime_config/acme/allactive/config_pes.xml.cime2 b/cime_config/acme/allactive/config_pes.xml.cime2
@@ -151,6 +151,10 @@
     <NTASKS_ATM>64</NTASKS_ATM> <NTHRDS_ATM>1</NTHRDS_ATM>   <ROOTPE_ATM>0</ROOTPE_ATM>      <NINST_ATM>1</NINST_ATM>
 </pes>
 
+<pes GRID="a%ne30np4" MACH="sandia-srn-sems">
+    <NTASKS_ATM>64</NTASKS_ATM> <NTHRDS_ATM>1</NTHRDS_ATM>   <ROOTPE_ATM>0</ROOTPE_ATM>      <NINST_ATM>1</NINST_ATM>
+</pes>
+
 <pes GRID="a%ne30np4" MACH="edison">
     <NTASKS_ATM>960</NTASKS_ATM> <NTHRDS_ATM>4</NTHRDS_ATM>   <ROOTPE_ATM>0</ROOTPE_ATM>      <NINST_ATM>1</NINST_ATM>
     <PIO_NUMTASKS>-1</PIO_NUMTASKS><PIO_STRIDE>24</PIO_STRIDE> <PIO_TYPENAME>pnetcdf</PIO_TYPENAME> <PIO_ROOT>1</PIO_ROOT>

diff --git a/cime_config/acme/allactive/config_pesall.xml b/cime_config/acme/allactive/config_pesall.xml
@@ -443,6 +443,41 @@
         </rootpe>
       </pes>
     </mach>
+    <mach name="sandia-srn-sems">
+      <pes compset="any" pesize="any">
+        <comment>none</comment>
+        <ntasks>
+          <ntasks_atm>64</ntasks_atm>
+          <ntasks_lnd>64</ntasks_lnd>
+          <ntasks_rof>64</ntasks_rof>
+          <ntasks_ice>64</ntasks_ice>
+          <ntasks_ocn>64</ntasks_ocn>
+          <ntasks_glc>64</ntasks_glc>
+          <ntasks_wav>64</ntasks_wav>
+          <ntasks_cpl>64</ntasks_cpl>
+        </ntasks>
+        <nthrds>
+          <nthrds_atm>1</nthrds_atm>
+          <nthrds_lnd>1</nthrds_lnd>
+          <nthrds_rof>1</nthrds_rof>
+          <nthrds_ice>1</nthrds_ice>
+          <nthrds_ocn>1</nthrds_ocn>
+          <nthrds_glc>1</nthrds_glc>
+          <nthrds_wav>1</nthrds_wav>
+          <nthrds_cpl>1</nthrds_cpl>
+        </nthrds>
+        <rootpe>
+          <rootpe_atm>0</rootpe_atm>
+          <rootpe_lnd>0</rootpe_lnd>
+          <rootpe_rof>0</rootpe_rof>
+          <rootpe_ice>0</rootpe_ice>
+          <rootpe_ocn>0</rootpe_ocn>
+          <rootpe_glc>0</rootpe_glc>
+          <rootpe_wav>0</rootpe_wav>
+          <rootpe_cpl>0</rootpe_cpl>
+        </rootpe>
+      </pes>
+    </mach>
   </grid>
   <grid name="a%ne30np4">
     <mach name="edison">

diff --git a/cime_config/acme/machines/config_compilers.xml b/cime_config/acme/machines/config_compilers.xml
@@ -619,7 +619,7 @@ for mct, etc.
   <CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
   <NETCDF_PATH>$(NETCDFROOT)</NETCDF_PATH>
   <PNETCDF_PATH>$(PNETCDFROOT)</PNETCDF_PATH>
-  <ADD_SLIBS> $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -lblas -llapack</ADD_SLIBS>
+  <ADD_SLIBS> $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -L/usr/lib64 -L/usr/lib64/atlas -lblas -llapack</ADD_SLIBS>
   <CXX_LIBS>-lstdc++ -lmpi_cxx</CXX_LIBS>
   <ALBANY_PATH>/projects/install/rhel6-x86_64/ACME/AlbanyTrilinos/Albany/build/install</ALBANY_PATH>
 </compiler>

diff --git a/cime_config/acme/machines/config_machines.xml b/cime_config/acme/machines/config_machines.xml
@@ -386,7 +386,7 @@
 
 <machine MACH="sandia-srn-sems">
     <DESC>Linux workstation at Sandia on SRN with SEMS TPL modules</DESC>
-    <NODENAME_REGEX>s999964</NODENAME_REGEX>
+    <NODENAME_REGEX>(s999964|climate)</NODENAME_REGEX>
     <PROXY>wwwproxy.sandia.gov:80</PROXY>
     <TESTS>acme_developer</TESTS>
     <OS>LINUX</OS>
@@ -400,6 +400,7 @@
     <DOUT_S_ROOT>$CESMSCRATCHROOT/archive/$CASE</DOUT_S_ROOT>
     <DOUT_L_MSROOT>csm/$CASE</DOUT_L_MSROOT>
     <CCSM_BASELINE>/sems-data-store/ACME/baselines</CCSM_BASELINE>
+    <SAVE_TIMING_DIR>/sems-data-store/ACME/timings</SAVE_TIMING_DIR>
     <CCSM_CPRNC>/sems-data-store/ACME/cprnc/build/cprnc</CCSM_CPRNC>
     <SUPPORTED_BY>jgfouca at sandia dot gov</SUPPORTED_BY>
 <!--    <GMAKE>make</GMAKE> <- this doesn't actually work! -->
@@ -424,6 +425,7 @@
       <cmd_path lang="sh">module</cmd_path>
       <modules>
 	<command name="purge"/>
+        <command name="load">git/2.1.3</command>
 	<command name="load">python/2.7.9</command>
 	<command name="load">gcc/5.1.0/openmpi/1.8.7</command>
 	<command name="load">cmake/2.8.12</command>

diff --git a/utils/python/CIME/test_scheduler.py b/utils/python/CIME/test_scheduler.py
@@ -595,7 +595,6 @@ def _get_procs_needed(self, test, phase, threads_in_flight=None):
         else:
             return 1
 
-
     ###########################################################################
     def _wait_for_something_to_finish(self, threads_in_flight):
     ###########################################################################
@@ -613,6 +612,17 @@ def _wait_for_something_to_finish(self, threads_in_flight):
             self._procs_avail += procs_needed
             del threads_in_flight[finished_test]
 
+    ###########################################################################
+    def _update_test_status_file(self, test, test_phase, status):
+    ###########################################################################
+        """
+        In general, test_scheduler should not be responsible for updating
+        the TestStatus file, but there are a few cases where it has to.
+        """
+        test_dir = self._get_test_dir(test)
+        with TestStatus(test_dir=test_dir, test_name=test) as ts:
+            ts.set_status(test_phase, status)
+
     ###########################################################################
     def _consumer(self, test, test_phase, phase_method):
     ###########################################################################
@@ -634,14 +644,9 @@ def _consumer(self, test, test_phase, phase_method):
         if test_phase in [CREATE_NEWCASE_PHASE, XML_PHASE, NAMELIST_PHASE]:
             # These are the phases for which TestScheduler is reponsible for
             # updating the TestStatus file
-            test_dir = self._get_test_dir(test)
-
-            with TestStatus(test_dir=test_dir, test_name=test) as ts:
-                nl_problem = self._get_test_data(test)[2]
-                if test_phase == NAMELIST_PHASE and nl_problem:
-                    ts.set_status(test_phase, TEST_FAIL_STATUS)
-                else:
-                    ts.set_status(test_phase, status)
+            nl_problem = self._get_test_data(test)[2]
+            status = TEST_FAIL_STATUS if nl_problem and test_phase == NAMELIST_PHASE else status
+            self._update_test_status_file(test, test_phase, status)
 
         # On batch systems, we want to immediately submit to the queue, because
         # it's very cheap to submit and will get us a better spot in line
@@ -684,6 +689,16 @@ def _producer(self):
                             threads_in_flight[test] = (new_thread, procs_needed, next_phase)
                             new_thread.start()
                             num_threads_launched_this_iteration += 1
+                        else:
+                            if not threads_in_flight:
+                                msg = "Phase '%s' for test '%s' required more processors, %d, than this machine can provide, %d" % \
+                                    (next_phase, test, procs_needed, self._procs_avail)
+                                logger.warning(msg)
+                                self._update_test_status(test, next_phase, TEST_PENDING_STATUS)
+                                self._update_test_status(test, next_phase, TEST_FAIL_STATUS)
+                                self._log_output(test, msg)
+                                self._update_test_status_file(test, next_phase, TEST_FAIL_STATUS)
+                                num_threads_launched_this_iteration += 1
 
             if not work_to_do:
                 break

diff --git a/utils/python/CIME/utils.py b/utils/python/CIME/utils.py
@@ -142,7 +142,7 @@ def get_model():
 
 _hack=object()
 def run_cmd(cmd, input_str=None, from_dir=None, verbose=None,
-            arg_stdout=_hack, arg_stderr=_hack):
+            arg_stdout=_hack, arg_stderr=_hack, env=None):
     """
     Wrapper around subprocess to make it much more convenient to run shell commands
 
@@ -170,7 +170,8 @@ def run_cmd(cmd, input_str=None, from_dir=None, verbose=None,
                             stdout=arg_stdout,
                             stderr=arg_stderr,
                             stdin=stdin,
-                            cwd=from_dir)
+                            cwd=from_dir,
+                            env=env)
 
     output, errput = proc.communicate(input_str)
     output = output.strip() if output is not None else output

diff --git a/utils/python/tests/scripts_regression_tests.py b/utils/python/tests/scripts_regression_tests.py
@@ -31,10 +31,10 @@
 
 # pragma pylint: disable=protected-access
 ###############################################################################
-def run_cmd_assert_result(test_obj, cmd, from_dir=None, expected_stat=0):
+def run_cmd_assert_result(test_obj, cmd, from_dir=None, expected_stat=0, env=None):
 ###############################################################################
     from_dir = os.getcwd() if from_dir is None else from_dir
-    stat, output, errput = run_cmd(cmd, from_dir=from_dir)
+    stat, output, errput = run_cmd(cmd, from_dir=from_dir, env=env)
     if expected_stat == 0:
         expectation = "SHOULD HAVE WORKED, INSTEAD GOT STAT %s" % stat
     else:
@@ -1219,8 +1219,7 @@ def query_var(self, var_name, env, var):
         environment = os.environ.copy()
         environment.update(env)
         environment.update(var)
-        subprocess.check_output(["gmake", "query", "--directory="+temp_dir],
-                                stderr=subprocess.STDOUT, env=environment)
+        run_cmd_assert_result(self.parent, "gmake query --directory=%s 2>&1" % temp_dir, env=environment)
 
         with open(output_name, "r") as output:
             query_result = output.read().strip()
@@ -1308,8 +1307,7 @@ def query_var(self, var_name, env, var):
 
         environment = os.environ.copy()
         environment.update(env)
-        subprocess.check_output(["cmake", "."], cwd=temp_dir,
-                                stderr=subprocess.STDOUT, env=environment)
+        run_cmd_assert_result(self.parent, "cmake . 2>&1", from_dir=temp_dir, env=environment)
 
         with open(output_name, "r") as output:
             query_result = output.read().strip()