wait_for_tests will now always specifically wait for the RUN phase

It used to terminate as soon as it saw no phases in the PEND state. This created a race condition between the end of the build phase where no states were in a PEND state but the run phase hadn't happened yet.
E3SM-Project · Aug 9, 2016 · f25a518 · f25a518
1 parent db1538e
commit f25a518
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 31 deletions.
diff --git a/scripts/Tools/wait_for_tests b/scripts/Tools/wait_for_tests
@@ -47,9 +47,6 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter
     parser.add_argument("-n", "--no-wait", action="store_true",
                         help="Do not wait for tests to finish")
 
-    parser.add_argument("-r", "--wait-for-run", action="store_true",
-                        help="Continue to wait until the RUN phase has passed")
-
     parser.add_argument("-t", "--check-throughput", action="store_true",
                         help="Fail if throughput check fails (fail if tests slow down)")
 
@@ -75,10 +72,7 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter
 
     CIME.utils.handle_standard_logging_options(args)
 
-    CIME.utils.expect(not (args.no_wait and args.wait_for_run),
-                      "Does not make sense to use --no-wait and --wait-for-run together")
-
-    return args.paths, args.no_wait, args.wait_for_run, args.check_throughput, args.check_memory, args.ignore_namelist_diffs, args.ignore_memleak, args.cdash_build_name, args.cdash_project, args.cdash_build_group
+    return args.paths, args.no_wait, args.check_throughput, args.check_memory, args.ignore_namelist_diffs, args.ignore_memleak, args.cdash_build_name, args.cdash_project, args.cdash_build_group
 
 ###############################################################################
 def _main_func(description):
@@ -87,12 +81,11 @@ def _main_func(description):
         CIME.utils.run_cmd_no_fail("python -m doctest %s/wait_for_tests.py -v" % CIME.utils.get_python_libs_root(), arg_stdout=None, arg_stderr=None)
         return
 
-    test_paths, no_wait, wait_for_run, check_throughput, check_memory, ignore_namelist_diffs, ignore_memleak, cdash_build_name, cdash_project, cdash_build_group = \
+    test_paths, no_wait, check_throughput, check_memory, ignore_namelist_diffs, ignore_memleak, cdash_build_name, cdash_project, cdash_build_group = \
         parse_command_line(sys.argv, description)
 
     sys.exit(0 if CIME.wait_for_tests.wait_for_tests(test_paths,
                                                 no_wait=no_wait,
-                                                wait_for_run=wait_for_run,
                                                 check_throughput=check_throughput,
                                                 check_memory=check_memory,
                                                 ignore_namelists=ignore_namelist_diffs,

diff --git a/utils/python/CIME/test_status.py b/utils/python/CIME/test_status.py
@@ -195,6 +195,14 @@ def get_overall_test_status(self, wait_for_run=False, check_throughput=False, ch
         'PASS'
         >>> _test_helper2('PASS ERS.foo.A MODEL_BUILD', wait_for_run=True)
         'PEND'
+        >>> _test_helper2('FAIL ERS.foo.A MODEL_BUILD', wait_for_run=True)
+        'FAIL'
+        >>> _test_helper2('PASS ERS.foo.A MODEL_BUILD\nPEND ERS.foo.A RUN', wait_for_run=True)
+        'PEND'
+        >>> _test_helper2('PASS ERS.foo.A MODEL_BUILD\nFAIL ERS.foo.A RUN', wait_for_run=True)
+        'FAIL'
+        >>> _test_helper2('PASS ERS.foo.A MODEL_BUILD\nPASS ERS.foo.A RUN', wait_for_run=True)
+        'PASS'
         """
         rv = TEST_PASS_STATUS
         run_phase_found = False

diff --git a/utils/python/CIME/wait_for_tests.py b/utils/python/CIME/wait_for_tests.py
@@ -258,7 +258,7 @@ def create_cdash_xml(results, cdash_build_name, cdash_project, cdash_build_group
     CIME.utils.run_cmd_no_fail("ctest -VV -D NightlySubmit", verbose=True)
 
 ###############################################################################
-def wait_for_test(test_path, results, wait, wait_for_run, check_throughput, check_memory, ignore_namelists, ignore_memleak):
+def wait_for_test(test_path, results, wait, check_throughput, check_memory, ignore_namelists, ignore_memleak):
 ###############################################################################
     if (os.path.isdir(test_path)):
         test_status_filepath = os.path.join(test_path, TEST_STATUS_FILENAME)
@@ -271,7 +271,8 @@ def wait_for_test(test_path, results, wait, wait_for_run, check_throughput, chec
         if (os.path.exists(test_status_filepath)):
             ts = TestStatus(test_dir=os.path.dirname(test_status_filepath))
             test_name = ts.get_name()
-            test_status = ts.get_overall_test_status(wait_for_run=wait_for_run, check_throughput=check_throughput,
+            test_status = ts.get_overall_test_status(wait_for_run=True, # Important
+                                                     check_throughput=check_throughput,
                                                      check_memory=check_memory, ignore_namelists=ignore_namelists,
                                                      ignore_memleak=ignore_memleak)
 
@@ -292,12 +293,12 @@ def wait_for_test(test_path, results, wait, wait_for_run, check_throughput, chec
                 break
 
 ###############################################################################
-def wait_for_tests_impl(test_paths, no_wait=False, wait_for_run=False, check_throughput=False, check_memory=False, ignore_namelists=False, ignore_memleak=False):
+def wait_for_tests_impl(test_paths, no_wait=False, check_throughput=False, check_memory=False, ignore_namelists=False, ignore_memleak=False):
 ###############################################################################
     results = Queue.Queue()
 
     for test_path in test_paths:
-        t = threading.Thread(target=wait_for_test, args=(test_path, results, not no_wait, wait_for_run, check_throughput, check_memory, ignore_namelists, ignore_memleak))
+        t = threading.Thread(target=wait_for_test, args=(test_path, results, not no_wait, check_throughput, check_memory, ignore_namelists, ignore_memleak))
         t.daemon = True
         t.start()
 
@@ -328,7 +329,6 @@ def wait_for_tests_impl(test_paths, no_wait=False, wait_for_run=False, check_thr
 ###############################################################################
 def wait_for_tests(test_paths,
                    no_wait=False,
-                   wait_for_run=False,
                    check_throughput=False,
                    check_memory=False,
                    ignore_namelists=False,
@@ -341,7 +341,7 @@ def wait_for_tests(test_paths,
     # is terminated
     set_up_signal_handlers()
 
-    test_results = wait_for_tests_impl(test_paths, no_wait, wait_for_run, check_throughput, check_memory, ignore_namelists, ignore_memleak)
+    test_results = wait_for_tests_impl(test_paths, no_wait, check_throughput, check_memory, ignore_namelists, ignore_memleak)
 
     all_pass = True
     for test_name, test_data in sorted(test_results.iteritems()):

diff --git a/utils/python/tests/scripts_regression_tests.py b/utils/python/tests/scripts_regression_tests.py
@@ -169,25 +169,24 @@ class D_TestWaitForTests(unittest.TestCase):
     def setUp(self):
     ###########################################################################
         self._testroot = MACHINE.get_value("CESMSCRATCHROOT")
-        self._testdir_all_pass     = os.path.join(self._testroot, 'scripts_regression_tests.testdir_all_pass')
-        self._testdir_with_fail    =  os.path.join(self._testroot, 'scripts_regression_tests.testdir_with_fail')
-        self._testdir_unfinished =  os.path.join(self._testroot, 'scripts_regression_tests.testdir_unfinished')
-        for testdir in self._testdir_all_pass, self._testdir_with_fail, self._testdir_unfinished:
+        self._testdir_all_pass    = os.path.join(self._testroot, 'scripts_regression_tests.testdir_all_pass')
+        self._testdir_with_fail   = os.path.join(self._testroot, 'scripts_regression_tests.testdir_with_fail')
+        self._testdir_unfinished  = os.path.join(self._testroot, 'scripts_regression_tests.testdir_unfinished')
+        self._testdir_unfinished2 = os.path.join(self._testroot, 'scripts_regression_tests.testdir_unfinished2')
+        testdirs = [self._testdir_all_pass, self._testdir_with_fail, self._testdir_unfinished, self._testdir_unfinished2]
+        for testdir in testdirs:
             if os.path.exists(testdir):
                 shutil.rmtree(testdir)
             os.makedirs(testdir)
 
         for r in range(10):
-            os.makedirs(os.path.join(self._testdir_all_pass, str(r)))
-            make_fake_teststatus(os.path.join(self._testdir_all_pass, str(r), "TestStatus"), "Test_%d" % r, "PASS", "RUN")
+            for testdir in testdirs:
+                os.makedirs(os.path.join(testdir, str(r)))
+                make_fake_teststatus(os.path.join(testdir, str(r), "TestStatus"), "Test_%d" % r, "PASS", "RUN")
 
-        for r in range(10):
-            os.makedirs(os.path.join(self._testdir_with_fail, str(r)))
-            make_fake_teststatus(os.path.join(self._testdir_with_fail, str(r), "TestStatus"), "Test_%d" % r, "PASS" if r % 2 == 0 else "FAIL", "SETUP" )
-
-        for r in range(10):
-            os.makedirs(os.path.join(self._testdir_unfinished, str(r)))
-            make_fake_teststatus(os.path.join(self._testdir_unfinished, str(r), "TestStatus"), "Test_%d" % r, "PEND" if r == 5 else "PASS", "COMPARE")
+        make_fake_teststatus(os.path.join(self._testdir_with_fail,   "5", "TestStatus"), "Test_5", "FAIL", "RUN")
+        make_fake_teststatus(os.path.join(self._testdir_unfinished,  "5", "TestStatus"), "Test_5", "PEND", "RUN")
+        make_fake_teststatus(os.path.join(self._testdir_unfinished2, "5", "TestStatus"), "Test_5", "PASS", "MODEL_BUILD")
 
         # Set up proxy if possible
         self._unset_proxy = setup_proxy()
@@ -244,7 +243,7 @@ def test_wait_for_test_all_pass(self):
     ###########################################################################
     def test_wait_for_test_with_fail(self):
     ###########################################################################
-        expected_results = ["PASS" if item % 2 == 0 else "FAIL" for item in range(10)]
+        expected_results = ["FAIL" if item == 5 else "PASS" for item in range(10)]
         self.simple_test(self._testdir_with_fail, expected_results)
 
     ###########################################################################
@@ -264,7 +263,28 @@ def test_wait_for_test_wait(self):
 
         self.assertTrue(run_thread.isAlive(), msg="wait_for_tests should have waited")
 
-        make_fake_teststatus(os.path.join(self._testdir_unfinished, "5", "TestStatus"), "Test_5", "PASS", "RUN")
+        with TestStatus(test_dir=os.path.join(self._testdir_unfinished, "5")) as ts:
+            ts.set_status(RUN_PHASE, TEST_PASS_STATUS)
+
+        run_thread.join(timeout=10)
+
+        self.assertFalse(run_thread.isAlive(), msg="wait_for_tests should have finished")
+
+        self.assertTrue(self._thread_error is None, msg="Thread had failure: %s" % self._thread_error)
+
+    ###########################################################################
+    def test_wait_for_test_wait2(self):
+    ###########################################################################
+        run_thread = threading.Thread(target=self.threaded_test, args=(self._testdir_unfinished2, ["PASS"] * 10))
+        run_thread.daemon = True
+        run_thread.start()
+
+        time.sleep(5) # Kinda hacky
+
+        self.assertTrue(run_thread.isAlive(), msg="wait_for_tests should have waited")
+
+        with TestStatus(test_dir=os.path.join(self._testdir_unfinished2, "5")) as ts:
+            ts.set_status(RUN_PHASE, TEST_PASS_STATUS)
 
         run_thread.join(timeout=10)
 
@@ -889,7 +909,7 @@ def test_single_submit(self):
         self.assertEqual(stat, 0,
                          msg="COMMAND SHOULD HAVE WORKED\ncreate_test output:\n%s\n\nerrput:\n%s\n\ncode: %d" % (output, errput, stat))
 
-        stat, output, errput = run_cmd("%s/wait_for_tests *%s/TestStatus -r" % (TOOLS_DIR, self._baseline_name),
+        stat, output, errput = run_cmd("%s/wait_for_tests *%s/TestStatus" % (TOOLS_DIR, self._baseline_name),
                                        from_dir=self._testroot)
         self.assertEqual(stat, 0,
                          msg="COMMAND SHOULD HAVE WORKED\nwait_for_tests output:\n%s\n\nerrput:\n%s\n\ncode: %d" % (output, errput, stat))