Merge branch 'jgfouca/scripts-acme/create_test_impl_parallel' into next (PR #426)

create_test: Implement full parallelism. Create_test will now automatically parallelize processing of test cases up to MAX_TASKS_PER_NODE. [BFB]
E3SM-Project · Oct 31, 2015 · 357bb6b
2 parents c89680f + 7f133b6
commit 357bb6b
Show file tree

Hide file tree

Showing 6 changed files with 264 additions and 125 deletions.
diff --git a/cime/machines-acme/config_machines.xml b/cime/machines-acme/config_machines.xml
@@ -298,7 +298,7 @@
          <BATCHREDIRECT></BATCHREDIRECT>
          <SUPPORTED_BY>jgfouca at sandia dot gov</SUPPORTED_BY>
          <GMAKE_J>4</GMAKE_J>
-         <MAX_TASKS_PER_NODE>8</MAX_TASKS_PER_NODE>
+         <MAX_TASKS_PER_NODE>16</MAX_TASKS_PER_NODE>
 	 <PIO_BUFFER_SIZE_LIMIT>1</PIO_BUFFER_SIZE_LIMIT>
          <PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
          <batch_system type="slurm" version="x.y">

diff --git a/cime/scripts-acme/bless_test_results b/cime/scripts-acme/bless_test_results
@@ -148,16 +148,16 @@ def bless_test_results(baseline_name, test_root, compiler, namelists_only=False,
             expect(create_test_impl.NAMELIST_PHASE in test_result,
                    "Test '%s' had no namelist phase" % test_name)
 
-            run_phase_pass = test_result[wait_for_tests.RUN_PHASE] == wait_for_tests.TEST_PASSED_STATUS
-            nl_pass        = test_result[create_test_impl.NAMELIST_PHASE] == wait_for_tests.TEST_PASSED_STATUS
+            run_phase_pass = test_result[wait_for_tests.RUN_PHASE] == wait_for_tests.TEST_PASS_STATUS
+            nl_pass        = test_result[create_test_impl.NAMELIST_PHASE] == wait_for_tests.TEST_PASS_STATUS
 
             if (not run_phase_pass):
                 warning("Test '%s' did not run successfully, it is not safe to bless results" % test_name)
                 time.sleep(2)
             else:
                 expect(wait_for_tests.HIST_COMPARE_PHASE in test_result,
                        "Test '%s' had no history compare phase" % test_name)
-                hist_pass = test_result[wait_for_tests.HIST_COMPARE_PHASE] == wait_for_tests.TEST_PASSED_STATUS
+                hist_pass = test_result[wait_for_tests.HIST_COMPARE_PHASE] == wait_for_tests.TEST_PASS_STATUS
 
                 if ( (nl_pass and hist_pass) or (nl_pass and namelists_only) or (hist_pass and hist_only) ):
                     print "Nothing to bless for test:", test_name, " overall status:", overall_result

diff --git a/cime/scripts-acme/create_test b/cime/scripts-acme/create_test
@@ -111,6 +111,10 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter
                         "If no testid is specified, then a time stamp will be"
                         "used.")
 
+    parser.add_argument("-j", "--parallel-jobs", type=int, default=None,
+                        help="Number of tasks create_test should perform simultaneously. Default "
+                        "will be min(num_cores, num_tests).")
+
     parser.add_argument("--old", action="store_true", help="Use CIME Perl impl")
 
     args = parser.parse_args(args[1:])
@@ -123,6 +127,9 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter
            "Provided baseline name but did not specify compare or generate")
     expect(not (args.namelists_only and not (args.generate or args.compare)),
            "Must provide either --compare or --generate with --namelists-only")
+    if (args.parallel_jobs is not None):
+        expect(args.parallel_jobs > 0,
+               "Invalid value for parallel_jobs: %d" % args.parallel_jobs)
 
     if (args.no_build):
         args.no_run = True
@@ -163,7 +170,7 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter
         args.test_id = acme_util.get_utc_timestamp()
 
     return args.testargs, args.compiler, args.no_run, args.no_build, args.no_batch, args.test_root, args.baseline_root, \
-        args.clean, args.compare, args.generate, args.baseline_name, args.namelists_only, args.project, args.test_id, args.old
+        args.clean, args.compare, args.generate, args.baseline_name, args.namelists_only, args.project, args.test_id, args.old, args.parallel_jobs
 
 ###############################################################################
 def get_tests_from_args(testargs, machine, compiler):
@@ -205,12 +212,15 @@ def get_tests_from_args(testargs, machine, compiler):
 ###############################################################################
 def create_test(testargs, compiler, no_run, no_build, no_batch, test_root,
                 baseline_root, clean, compare, generate,
-                baseline_name, namelists_only, project, test_id, old):
+                baseline_name, namelists_only, project, test_id, old, parallel_jobs):
 ###############################################################################
     machine = acme_util.probe_machine_name()
 
     tests_to_run = get_tests_from_args(testargs, machine, compiler)
 
+    if (parallel_jobs is None):
+        parallel_jobs = min(len(tests_to_run), int(acme_util.get_machine_info("MAX_TASKS_PER_NODE")))
+
     expect(len(tests_to_run) > 0, "No tests to run")
 
     if (not old):
@@ -220,7 +230,7 @@ def create_test(testargs, compiler, no_run, no_build, no_batch, test_root,
                                            baseline_root, baseline_name,
                                            clean,
                                            compare, generate, namelists_only,
-                                           project)
+                                           project, parallel_jobs)
         return 0 if impl.create_test() else 1
     else:
 
@@ -274,11 +284,11 @@ def _main_func(description):
     acme_util.stop_buffering_output()
 
     testargs, compiler, no_run, no_build, no_batch, test_root, baseline_root, clean, \
-        compare, generate, baseline_name, namelists_only, project, test_id, old = \
+        compare, generate, baseline_name, namelists_only, project, test_id, old, parallel_jobs = \
         parse_command_line(sys.argv, description)
 
     sys.exit(create_test(testargs, compiler, no_run, no_build, no_batch, test_root, baseline_root, clean,
-                         compare, generate, baseline_name, namelists_only, project, test_id, old))
+                         compare, generate, baseline_name, namelists_only, project, test_id, old, parallel_jobs))
 
 ###############################################################################