PecanProject · infotroph · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,7 +10,8 @@ For more information about this file see also [Keep a Changelog](http://keepacha
 ### Added
 
 - Documentation of `make` options including addition of `make help` 
-- Add make option to document a single package with `make documentation pathto/package` 
+- Add make option to document a single package with `make documentation pathto/package`
+- `settings$host$qsub` and `settings$host$modellauncher$qsub.extra` will now expand `@NJOBS@` to the number of models in the run (or to `settings$host$modellauncher$Njobmax` when that is set), allowing e.g. `--array=1-@NJOBS@`. Note that qsub still by default submits every model as a separate job, so for now this is mostly useful for custom modellauncher scripts.
 
 ### Fixed
 - updated github action to build docker images

diff --git a/base/remote/NAMESPACE b/base/remote/NAMESPACE
@@ -15,9 +15,7 @@ export(remote.copy.from)
 export(remote.copy.to)
 export(remote.execute.R)
 export(remote.execute.cmd)
-export(runModule.start.model.runs)
 export(setup_modellauncher)
-export(start.model.runs)
 export(start_qsub)
 export(start_rabbitmq)
 export(start_serial)

diff --git a/base/remote/NEWS.md b/base/remote/NEWS.md
@@ -2,6 +2,10 @@
 
 * PEcAn.remote is now distributed under the BSD 3-clause license instead of the NCSA Open Source license.
 
+## Removed
+
+* `start.model.runs` and `runModule.start.model.runs`, defunct since they were
+  moved to `PEcAn.workflow` in 2021, have been deleted.
 
 # PEcAn.remote 1.8.0
 

diff --git a/base/remote/R/check_model_run.R b/base/remote/R/check_model_run.R
@@ -1,7 +1,7 @@
 #' Check if model run was successful
 #'
 #' @param out Output from model execution, as a character.
-#' @inheritParams start.model.runs
+#' @param stop.on.error Throw error if _any_ of the runs fails. Default TRUE.
 #'
 #' @return `TRUE` if model run succeeded. If model run failed, throw an error if `stop.on.error`, or return FALSE.
 #' @export

diff --git a/base/remote/R/qsub_get_jobid.R b/base/remote/R/qsub_get_jobid.R
@@ -1,7 +1,6 @@
 #' Get Job ID from qsub output
 #'
 #' @inheritParams check_model_run
-#' @inheritParams start.model.runs
 #' @param qsub.jobid (character) Regular expression string for extracting job ID from qsub output.
 #' Usually from `settings$host$qsub.jobid`
 #'

diff --git a/base/remote/R/start.model.runs.R b/base/remote/R/start.model.runs.R
diff --git a/base/remote/R/start_qsub.R b/base/remote/R/start_qsub.R
@@ -19,12 +19,12 @@ start_qsub <- function(run, qsub_string, rundir,
 
   run_id_string <- format(run, scientific = FALSE)
 
+  if (!is.null(qsub_extra)) {
+    qsub_string <- paste(qsub_string, qsub_extra)
+  }
   qsub <- gsub("@NAME@", paste0("PEcAn-", run_id_string), qsub_string)
   qsub <- gsub("@STDOUT@", file.path(host_outdir, run_id_string, stdout_log), qsub)
   qsub <- gsub("@STDERR@", file.path(host_outdir, run_id_string, stderr_log), qsub)
-  if (!is.null(qsub_extra)) {
-    qsub <- paste(qsub, qsub_extra)
-  }
   # NOTE: This converts `qsub` to a list.
   qsub <- strsplit(qsub, " (?=([^\"']*\"[^\"']*\")*[^\"']*$)", perl = TRUE)
 

diff --git a/base/remote/man/start.model.runs.Rd b/base/remote/man/start.model.runs.Rd
diff --git a/base/remote/tests/Rcheck_reference.log b/base/remote/tests/Rcheck_reference.log
diff --git a/base/workflow/NEWS.md b/base/workflow/NEWS.md
@@ -2,6 +2,8 @@
 
 * PEcAn.workflow is now distributed under the BSD 3-clause license instead of the NCSA Open Source license.
 * It is now easier to run a workflow without a connection to the PEcAn database by setting `settings$database$bety$write` to FALSE (or undefining it entirely), at the obvious cost that runs set up this way are not recorded in the database (@yinghaoSunn, #3398).
+* Improved handling of `modellauncher` in `start_model_runs()`,
+  including some support for array runs via settings like `<qsub.extra>-t 1-@NJOBS@</qsub.extra>`.
 
 # PEcAn.workflow 1.8.0
 

diff --git a/base/workflow/R/start_model_runs.R b/base/workflow/R/start_model_runs.R
@@ -39,18 +39,30 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
   is_rabbitmq <- !is.null(settings$host$rabbitmq)
   is_modellauncher <- !is.null(settings$host$modellauncher)
 
-  # Check if Njobmax tag exists in seetings
+  # Check if Njobmax tag exists in settings
   if (is_modellauncher){
     if (!is.null(settings$host$modellauncher$Njobmax)){
       Njobmax <- settings$host$modellauncher$Njobmax
     } else {
       Njobmax <- nruns
     }
+    settings$host$modellauncher$qsub.extra <- gsub(
+      "@NJOBS@",
+      Njobmax,
+      settings$host$modellauncher$qsub.extra
+    )
+
     compt_run <- 0
     compt_run_modellauncher <- 1
     job_modellauncher <- list()
+  } else {
+    Njobmax <- nruns
   }
-
+
+  if (is_qsub) {
+    settings$host$qsub <- gsub("@NJOBS@", Njobmax, settings$host$qsub)
+  }
+
   # loop through runs and either call start run, or launch job on remote machine
   jobids <- list()
 
@@ -68,7 +80,6 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
 
   # launcher folder
   jobfile <- NULL
-  firstrun <- NULL
 
   #Copy all run directories over if not local
   if (!is_local) {
@@ -115,9 +126,8 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
       jobids[run] <- folder
 
     } else if (is_modellauncher) {
-      # set up launcher script if we use modellauncher
-      if (is.null(firstrun)) {
-        firstrun <- run
+      # set up one launcher script for each chunk of up to Njobmax jobs
+      if (is.null(jobfile)) {
         jobfile <- PEcAn.remote::setup_modellauncher(
           run = run,
           rundir = settings$rundir,
@@ -131,6 +141,13 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
         c(file.path(settings$host$rundir, run_id_string)),
         con = jobfile)
       pbi <- pbi + 1
+      compt_run <- compt_run + 1
+      # Check if compt_run has reached Njobmax
+      if (compt_run == Njobmax) {
+        close(jobfile)
+        compt_run <- 0
+        jobfile <- NULL
+      }
 
     } else if (is_qsub) {
       out <- PEcAn.remote::start_qsub(
@@ -179,18 +196,6 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
       pbi <- pbi + 1
       utils::setTxtProgressBar(pb, pbi)
     }
-
-    # Check if compt_run has reached Njobmax
-    if (is_modellauncher){
-      compt_run <- compt_run + 1
-      if (compt_run == Njobmax){
-        close(jobfile)
-        firstrun <- NULL
-        compt_run <- 0
-        jobfile <- NULL
-      }      
-    }
-
   } # end loop over runs
   close(pb)
 
@@ -231,30 +236,35 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
             stdout_log = "launcher.out.log",
             stderr_log = "launcher.err.log",
             job_script = "launcher.sh",
-            qsub_extra = settings$host$modellauncher$qsub)
+            qsub_extra = settings$host$modellauncher$qsub.extra)
         }
         # HACK: Code below gets 'run' from names(jobids) so need an entry for
         # each run. But when using modellauncher all runs have the same jobid
         jobids[run] <- sub(settings$host$qsub.jobid, "\\1", out[length(out)])
       }
 
     } else {
+      pb <- utils::txtProgressBar(min = 0, max = nruns, style = 3)
+      pbi <- 0
+      for (run in job_modellauncher) {
         out <- PEcAn.remote::start_serial(
           run = run,
           host = settings$host,
           rundir = settings$rundir,
           host_rundir = settings$host$rundir,
           job_script = "launcher.sh")
-
-      # check output to see if an error occurred during the model run
-      PEcAn.remote::check_model_run(out = out, stop.on.error = TRUE)
-
+
+        # check output to see if an error occurred during the model run
+        PEcAn.remote::check_model_run(out = out, stop.on.error = TRUE)
+
+        pbi <- pbi + 1
+        utils::setTxtProgressBar(pb, pbi)
+      }
+      close(pb)
       # write finished time to database
       for (run in run_list) {
         PEcAn.DB::stamp_finished(con = dbcon, run = run)
       }
-
-      utils::setTxtProgressBar(pb, pbi)
     }
   }
 
@@ -311,12 +321,14 @@ start_model_runs <- function(settings, write = TRUE, stop.on.error = TRUE) {
 
         # Write finish time to database
         #TODO this repeats for every run in `jobids` writing every run's time stamp every time. This actually takes quite a long time with a lot of ensembles and should either 1) not be a for loop (no `for(x in run_list)`) or 2) if `is_modellauncher`, be done outside of the jobids for loop after all jobs are finished.
-        if (is_modellauncher) {
+        if (is_modellauncher && write) {
           for (x in run_list) {
             PEcAn.DB::stamp_finished(con = dbcon, run = x)
           }
         } else {
-          PEcAn.DB::stamp_finished(con = dbcon, run = run)
+          if (write) {
+            PEcAn.DB::stamp_finished(con = dbcon, run = run)
+          }
         }
 
         # move progress bar

diff --git a/book_source/03_topical_pages/03_pecan_xml.Rmd b/book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -527,10 +527,11 @@ The `host` section has the following tags:
 
 The `modellauncher` section, if specified, will group all runs together and only submit a single job to the HPC cluster. This single job will make use of an MPI program that will execute a single run. Some HPC systems will place a limit on the number of jobs that can be executed in parallel; this will only submit a single job (using multiple nodes). In case there is no limit on the number of jobs, a single PEcAn run could potentially submit a lot of jobs resulting in the full cluster running jobs for a single PEcAn run, preventing others from executing on the cluster.
 
-The `modellauncher` has 3 arguments:
+The `modellauncher` section has 4 arguments:
 * `binary` : [required] The full path to the binary modellauncher. Source code for this file can be found in [`pecan/contrib/modellauncher`](https://github.com/PecanProject/pecan/tree/develop/contrib/modellauncher).
-* `qsub.extra` : [optional] Additional flags to pass to qsub besides those specified in the `qsub` tag in host. This option can be used to specify that the MPI environment needs to be used and the number of nodes that should be used.
+* `qsub.extra` : [optional] Additional flags to pass to qsub besides those specified in the `qsub` tag in host. This option can be used to specify that the MPI environment needs to be used and the number of nodes that should be used. If it contains the string `@NJOBS@`, that will be replaced with the value of `Njobmax` if it is set, or otherwise with the number of jobs in the run (useful for submitting array jobs).
 * `mpirun`: [optional] Additional commands to be added to the front of `launcher.sh`.  Default is `mpirun <path to binary> <path to joblist.txt>`.  Edit this to, for example, load necessary modules to run `mpirun`.
+* `Njobmax`: [optional] Maximum number of jobs to launch from one script. If your run has more jobs than this, PEcAn will create multiple launcher scripts, each listing at most `Njobmax` jobs.
 
 ## Advanced features {#xml-advanced}
 

diff --git a/modules/assim.batch/R/pda.bayesian.tools.R b/modules/assim.batch/R/pda.bayesian.tools.R
@@ -216,7 +216,7 @@ pda.bayesian.tools <- function(settings, external.data = NULL, external.priors =
                                                                                                              now, sep = "."))
 
     ## Start model run
-    PEcAn.remote::start.model.runs(settings, FALSE)
+    PEcAn.workflow::start_model_runs(settings, write = FALSE)
 
     ## Read model outputs
     align.return <- pda.get.model.output(settings, run.id, NULL, inputs, external.formats)