From 844c441c9d825d2bbf79a48b58195552f5051192 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Fri, 21 Aug 2020 09:32:39 -0400 Subject: [PATCH 01/21] Add first version of egs-parallel scripts --- HEN_HOUSE/scripts/egs-parallel-clean | 58 +++++++ HEN_HOUSE/scripts/egs-parallel-dshtask | 203 +++++++++++++++++++++++++ HEN_HOUSE/scripts/egs-parallel-pbsdsh | 93 +++++++++++ HEN_HOUSE/scripts/egs-parallel-run | 162 ++++++++++++++++++++ 4 files changed, 516 insertions(+) create mode 100755 HEN_HOUSE/scripts/egs-parallel-clean create mode 100755 HEN_HOUSE/scripts/egs-parallel-dshtask create mode 100755 HEN_HOUSE/scripts/egs-parallel-pbsdsh create mode 100755 HEN_HOUSE/scripts/egs-parallel-run diff --git a/HEN_HOUSE/scripts/egs-parallel-clean b/HEN_HOUSE/scripts/egs-parallel-clean new file mode 100755 index 000000000..626d7f3a5 --- /dev/null +++ b/HEN_HOUSE/scripts/egs-parallel-clean @@ -0,0 +1,58 @@ +#!/bin/bash + +# usage display +usage() { + echo -e "\nPlease provide at least one simulation basename." + echo -e "\nUSAGE:\n$(basename $0) [ basename1 basename2 ... ]\n" +} + +# check if there is at least one argument +if [ $# -lt 1 ]; then + usage + exit 1 +fi + +# simulation base names +names=("$@") + +# loop over all names +for name in "${names[@]}"; do + + # echo current name + echo "CLEANING ${name} ..." 
+ + # loop over file extensions + for ext in lock eo mederr egsjob; do + + # remove files + count=$( find -maxdepth 1 -name "${name}.${ext}" | wc -l ) + if [ $count -gt 0 ]; then + for f in ${name}.${ext}; do + (set -x; /bin/rm $f) + done + fi + done + + # remove parallel pbs directory + count=$( find -maxdepth 1 -name "${name}.pbs*" | wc -l ) + if [ $count -gt 0 ]; then + for f in ${name}.pbs*; do + (set -x; /bin/rm -r $f) + done + fi + + # remove work files + count=$( find -maxdepth 1 -name "${name}_w*" | wc -l ) + if [ $count -gt 0 ]; then + echo "+ /bin/rm ${name}_w*" + /bin/rm ${name}_w* + fi + + # remove egsrun directories + count=$( find -maxdepth 1 -name "egsrun_*_${name}_*" | wc -l ) + if [ $count -gt 0 ]; then + echo "+ /bin/rm egsrun_*_${name}_*" + /bin/rm -r egsrun_*_${name}_* + fi + +done diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask new file mode 100755 index 000000000..e638579af --- /dev/null +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -0,0 +1,203 @@ +#!/bin/bash + +### help function +function help { + log "HELP" + cat < $f + job=$((job+1)) + done +fi + +### wait for my job index (which is printed in task file by manager job) +delta=2 +while [ ! 
-s $taskfile ]; do + log "$taskstr: wait $delta seconds for job index" + sleep $delta +done + +### job index +job=$(cat $taskfile) +jobstr=$(printf "job %04d" $job) +log "$jobstr <- $taskstr" + +### job label +jobstr=$(printf "job %04d" $job) + +### manage jobs to avoid bottleneck and race conditions +if [ $job -eq 1 ]; then + + # log host and pid of job 1 in .egsjob file + log "$jobstr: host=$(hostname) BEGIN pid=$$" > $basename.egsjob + +else + + # log the host and pid of this job + log "$jobstr: host=$(hostname) pid=$$" + + # all jobs wait a fixed delay (relative to first job) + delta=1 + log "$jobstr: wait $delta seconds (initial delay)" + sleep $delta + + # wait until there is an .egsjob file (maximum 120 seconds) + total=0 + delta=10 + limit=120 + while [ ! -e $basename.egsjob ]; do + log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" + sleep $delta + total=$((total+$delta)) + if [ $total -gt $limit ]; then + log "$jobstr: QUIT (no $basename.egsjob file after $limit seconds)" + exit + fi + done + + # sleep until there is a lock file, maximum 300 seconds + total=0 + delta=10 + limit=300 + while [ ! 
-e $basename.lock ]; do + + # quit if simulation is already done + quit_if_done + + # otherwise wait for lock file + log "$jobstr: wait $delta seconds ($basename.lock not found after $total seconds)" + sleep $delta + total=$((total+$delta)) + if [ $total -gt $limit ]; then + log "$jobstr: QUIT ($basename.lock not found after $limit seconds)" + exit + fi + done + + # offset all jobs by a fixed delay (relative to previous job) + delta=100000 + log "$jobstr: wait $((job*$delta)) microseconds (default job offset delay)" + for j in $(seq 1 $job); do + usleep $delta + quit_if_done + done + + # extra user-specified delay between each job + delta=$delay + if [ $delta -gt 0 ]; then + log "$jobstr: wait $((job*$delta)) seconds (user job offset delay)" + for j in $(seq 1 $job); do + sleep $delta + quit_if_done + done + fi + + # report on lock file content + if [ -r $basename.lock ]; then + content=$(cat $basename.lock) + log "$jobstr: found $basename.lock: $content" + else + log "$jobstr: QUIT ($basename.lock does not exist or is not readable)" + exit + fi + +fi + +### launch the job if simulation is not done already +quit_if_done +log "$jobstr: RUN $command -j $job" +source $HEN_HOUSE/scripts/egsnrc_bashrc_additions +$command -j $job + +### report that this job is done +log "$jobstr: DONE." + +### report that the simulation (job 1) is done +if [ $job -eq 1 ]; then + log "$jobstr: host=$(hostname) END pid=$$" >> $basename.egsjob + log "$jobstr: FINISH simulation" +fi \ No newline at end of file diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh new file mode 100755 index 000000000..b7cd2602c --- /dev/null +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -0,0 +1,93 @@ +#!/bin/bash + +### help function +function help { + log "HELP" + cat <&1) +if ! 
[[ -z $err ]]; then + quit $LINENO "$err" +fi + +### launch pbsdsh tasks +jobid=$(qsub -q $queue $scheduler_options < Date: Mon, 24 Aug 2020 19:13:02 -0400 Subject: [PATCH 02/21] Improve egs-parallel scripts Save the egs-parallel log inside an *.egsparallel file in the application directory, and add a verbosity option (-v) to also echo the log to screen. By default the scripts proceed silently, unless an error condition arises, which is always echoed to the terminal. --- HEN_HOUSE/scripts/egs-parallel-clean | 8 ---- HEN_HOUSE/scripts/egs-parallel-dshtask | 51 +++++++++++++------------- HEN_HOUSE/scripts/egs-parallel-pbsdsh | 50 +++++++++++++++---------- 3 files changed, 57 insertions(+), 52 deletions(-) diff --git a/HEN_HOUSE/scripts/egs-parallel-clean b/HEN_HOUSE/scripts/egs-parallel-clean index 626d7f3a5..8c6e8d9ab 100755 --- a/HEN_HOUSE/scripts/egs-parallel-clean +++ b/HEN_HOUSE/scripts/egs-parallel-clean @@ -33,14 +33,6 @@ for name in "${names[@]}"; do fi done - # remove parallel pbs directory - count=$( find -maxdepth 1 -name "${name}.pbs*" | wc -l ) - if [ $count -gt 0 ]; then - for f in ${name}.pbs*; do - (set -x; /bin/rm -r $f) - done - fi - # remove work files count=$( find -maxdepth 1 -name "${name}_w*" | wc -l ) if [ $count -gt 0 ]; then diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index e638579af..8eca7c174 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -3,11 +3,11 @@ ### help function function help { log "HELP" - cat < $f @@ -101,13 +102,13 @@ while [ ! 
-s $taskfile ]; do sleep $delta done -### job index +### job index and label job=$(cat $taskfile) jobstr=$(printf "job %04d" $job) log "$jobstr <- $taskstr" -### job label -jobstr=$(printf "job %04d" $job) +# log the host and pid of this job +log "$jobstr: host=$(hostname) BEGIN pid=$$" ### manage jobs to avoid bottleneck and race conditions if [ $job -eq 1 ]; then @@ -117,9 +118,6 @@ if [ $job -eq 1 ]; then else - # log the host and pid of this job - log "$jobstr: host=$(hostname) pid=$$" - # all jobs wait a fixed delay (relative to first job) delta=1 log "$jobstr: wait $delta seconds (initial delay)" @@ -139,6 +137,9 @@ else fi done + # quit if simulation is already done + quit_if_done + # sleep until there is a lock file, maximum 300 seconds total=0 delta=10 @@ -189,12 +190,12 @@ fi ### launch the job if simulation is not done already quit_if_done -log "$jobstr: RUN $command -j $job" source $HEN_HOUSE/scripts/egsnrc_bashrc_additions -$command -j $job +log "$jobstr: RUN $command -b -P $ncore-j $job" +$command b -P $ncore -j $job ### report that this job is done -log "$jobstr: DONE." 
+log "$jobstr: host=$(hostname) END pid=$$" ### report that the simulation (job 1) is done if [ $job -eq 1 ]; then diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index b7cd2602c..c4a30edf3 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -3,11 +3,11 @@ ### help function function help { log "HELP" - cat <&3 + if [ "$verbosity" = "verbose" ]; then + printf "$msg" + fi } -### quit function (with optional command as $2) +### quit function function quit { lineno=$1 msg=$2 case $3 in - help) run="help";; - *) run="";; + help) cmd="help";; + *) cmd="";; esac - log "$(basename $0) line $lineno: $msg"; $run; log "QUIT."; exit 1 + verbosity="verbose" + log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } -### begin script -log "BEGIN $0" - ### parse command-line arguments (simplistic) args_min=6 if [ "$#" -lt $args_min ]; then + exec 3>/dev/null quit $LINENO "only $# arguments provided; at least $args_min required" help fi queue=$1 @@ -56,9 +60,13 @@ first=$4 basename=$5 command=$6 scheduler_options=$7 +verbosity=$8 -### add parallel flags to egs command -command="$command -b -P $ncore" +### link file descriptor 3 to egs-parallel log file +exec 3>>$basename.egsparallel + +### begin script +log "BEGIN $0" ### set scheduler job name (maximum 14 characters) jobname=$(echo ${basename}[$ncore]) @@ -71,23 +79,27 @@ log "job name: $jobname" ### create pbsdsh directory to store task files for job numbers pbsdsh_dir=$basename.pbsdsh log "create temporary directory $pbsdsh_dir" +if [ -e $pbsdsh_dir ]; then + /bin/rm -r $pbsdsh_dir +fi err=$(mkdir $pbsdsh_dir 2>&1) if ! 
[[ -z $err ]]; then quit $LINENO "$err" fi - ### launch pbsdsh tasks -jobid=$(qsub -q $queue $scheduler_options < Date: Mon, 24 Aug 2020 20:36:17 -0400 Subject: [PATCH 03/21] Add egs-parallel sub-script for standard pbs jobs --- HEN_HOUSE/scripts/egs-parallel-pbs | 206 +++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100755 HEN_HOUSE/scripts/egs-parallel-pbs diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs new file mode 100755 index 000000000..1c7e11d5e --- /dev/null +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -0,0 +1,206 @@ +#!/bin/bash + +### help function +function help { + log "HELP" + cat <&3 + if [ "$verbosity" = "verbose" ]; then + printf "$msg" + fi +} + +### quit function +function quit { + lineno=$1 + msg=$2 + case $3 in + help) cmd="help";; + *) cmd="";; + esac + verbosity="verbose" + log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 +} + +### quit function if simulation is done +function quit_if_done { + if [ -r $basename.egsjob ]; then + done=$(grep -o END $basename.egsjob) + if [ "$done" = "END" ]; then + log "$jobstr: QUIT (simulation already finished)" + exit + fi + fi +} + +### begin script +log "BEGIN $0" + +### parse command-line arguments (simplistic) +args_min=6 +if [ "$#" -lt $args_min ]; then + quit $LINENO "only $# arguments provided; at least $args_min required" help +fi +queue=$1 +ncore=$2 +delay=$3 +first=$4 +basename=$5 +command=$6 +scheduler_options=$7 +verbosity=$8 + +### link file descriptor 3 to egs-parallel log file +exec 3>>$basename.egsparallel + +### set scheduler job name (maximum 14 characters) +jobname=$(echo ${basename}[$ncore]) +trim=$(( $(echo $jobname | wc -c) - 14 )) +if [ $trim -gt 0 ]; then + jobname=$(echo $jobname | cut -c $trim-) +fi +log "job name: $jobname" + +### loop to launch ncore pbs jobs +for job in $(seq 1 $ncore); do + + # job label + jobstr=$(printf "job %04d" $job) + + ### manage jobs to avoid bottleneck and race conditions + if [ $job 
-gt 1 ]; then + + # job 2 does all the waiting + delta=2 + if [ $job -eq 2 ]; then + + # wait a fixed delay (relative to first job) + log "$jobstr: wait $delta seconds (initial delay)" + sleep $delta + + # wait until there is an .egsjob file (maximum 120 seconds) + total=0 + delta=10 + limit=120 + while [ ! -e $basename.egsjob ]; do + log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" + sleep $delta + total=$((total+$delta)) + if [ $total -gt $limit ]; then + log "$jobstr: QUIT (no $basename.egsjob file after $limit seconds)" + exit + fi + done + + # quit if simulation is already done + quit_if_done + + # sleep until there is a lock file, maximum 300 seconds + total=0 + delta=10 + limit=300 + while [ ! -e $basename.lock ]; do + + # quit if simulation is already done + quit_if_done + + # otherwise wait for lock file + log "$jobstr: wait $delta seconds ($basename.lock not found after $total seconds)" + sleep $delta + total=$((total+$delta)) + if [ $total -gt $limit ]; then + log "$jobstr: QUIT ($basename.lock not found after $limit seconds)" + exit + fi + done + fi + + # quit if simulation is already done + quit_if_done + + # offset all jobs by a fixed delay (relative to previous job) + delta=500000 + log "$jobstr: wait $delta microseconds (default job offset delay)" + usleep $delta + + # extra user-specified delay between each job + delta=$delay + if [ $delta -gt 0 ]; then + log "$jobstr: wait $delta seconds (user job offset delay)" + sleep $delta + fi + + # report on lock file content + if [ -r $basename.lock ]; then + content=$(cat $basename.lock) + log "$jobstr: found $basename.lock: $content" + else + log "$jobstr: QUIT ($basename.lock does not exist or is not readable)" + exit + fi + fi + + ### launch the job if simulation is not done already + quit_if_done + log "$jobstr: RUN $command -b -P $ncore -j $job" + source $HEN_HOUSE/scripts/egsnrc_bashrc_additions + jobid=$(qsub -q $queue $scheduler_options < 
\$PBS_O_WORKDIR/$basename.egsjob +fi + +### run command +$command -b -P $ncore -j $job + +### log completion +log "$jobstr: PBS host=\$(hostname) END pid=\$\$" +if [ $job -eq 1 ]; then + log "$jobstr: host=\$(hostname) END pid=\$\$" >> \$PBS_O_WORKDIR/$basename.egsjob + log "$jobstr: FINISH simulation" +fi +EOF + ) + echo $jobid + +done From 08d17f27091f2422fb874452236a9c4f74e0c84e Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Tue, 25 Aug 2020 17:51:56 -0400 Subject: [PATCH 04/21] Improve top-level egs-parallel script Notably, save log message to a log file, add a verbosity option (-v), and allow joined single-letter options and argument (without a space between the option and its argument, as in "-n123"). --- .../{egs-parallel-run => egs-parallel} | 93 +++++++++++++------ 1 file changed, 67 insertions(+), 26 deletions(-) rename HEN_HOUSE/scripts/{egs-parallel-run => egs-parallel} (69%) diff --git a/HEN_HOUSE/scripts/egs-parallel-run b/HEN_HOUSE/scripts/egs-parallel similarity index 69% rename from HEN_HOUSE/scripts/egs-parallel-run rename to HEN_HOUSE/scripts/egs-parallel index aacab1e33..ea1442461 100755 --- a/HEN_HOUSE/scripts/egs-parallel-run +++ b/HEN_HOUSE/scripts/egs-parallel @@ -3,7 +3,7 @@ ### help function function help { log "HELP" - cat <&3 + if [ "$verbosity" = "verbose" ]; then + printf "$msg" + fi } -### quit function (with optional command as $2) +### quit function for errors, with source, line, message and command function quit { lineno=$1 msg=$2 case $3 in - help) run="help";; - *) run="";; + help) cmd="help";; + *) cmd="";; esac - log "$(basename $0) line $lineno: $msg"; $run; log "QUIT."; exit 1 + verbosity="verbose" + log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } -### begin script -log "BEGIN $0" - -### show EGSnrc environment variables -log "EGSnrc environment:" -log " HEN_HOUSE = $HEN_HOUSE" -log " EGS_HOME = $EGS_HOME" -log " EGS_CONFIG = $EGS_CONFIG" -hen_house=${HEN_HOUSE%/} -egs_home=${EGS_HOME%/} +### link file 
descriptor 3 to egs-parallel log file +exec 3>egs-parallel-$$.log ### default option values opt_batch="pbsdsh" @@ -55,31 +58,61 @@ opt_queue="long" opt_ncore="8" opt_delay="0" opt_command="" -opt_option="" +opt_options="" +verbosity="silent" +declare -a opt_options_array -### parse command-line options (simplistic) +### parse command-line options while [ "$#" -gt 0 ]; do + + # consume next command-line token opt=$1; shift - arg=$1; shift + + # options without arguments case $opt in -h|--help) help; exit;; + -v|--verbose) verbosity="verbose"; continue;; + esac + + # allow joined single-letter options and argument (no space in between) + if [ -n "${opt:2}" ] && [ "${opt:0:1}" = "-" ] && ! [ "${opt:0:2}" = "--" ]; then + arg="${opt:2}" + opt="${opt:0:2}" + else + arg=$1; shift; + fi + + # options with arguments + case $opt in -b|--batch) opt_batch="$arg";; -q|--queue) opt_queue="$arg";; -n|--ncore) opt_ncore="$arg";; -d|--delay) opt_delay="$arg";; -c|--command) opt_cmd="$arg";; - -o|--option) opt_option="${opt_option}${opt_option:+" "}$arg"; arg="parsed";; + -o|--option) opt_options_array+=("$arg"); arg="parsed";; *) quit $LINENO "unknown option: $opt" help;; esac if [ -z "$arg" ] || [ "${arg:0:1}" = "-" ]; then quit $LINENO "missing argument to $opt option" help fi done +opt_options="${opt_options_array[@]}" + +### begin script +log "BEGIN $0" + +### EGSnrc environment variables +log "EGSnrc environment:" +log " HEN_HOUSE = $HEN_HOUSE" +log " EGS_HOME = $EGS_HOME" +log " EGS_CONFIG = $EGS_CONFIG" +hen_house=${HEN_HOUSE%/} +egs_home=${EGS_HOME%/} ### check that the batch option command exists and is executable batch_script=$hen_house/scripts/egs-parallel-${opt_batch} if ! 
[ -x "$(command -v ${batch_script})" ]; then - quit $LINENO "no such batch option command found: ${batch_script}" + quit $LINENO "batch script not found: ${batch_script}" fi ### check that ncore is an integer @@ -151,12 +184,20 @@ log " delay = $opt_delay" log " command = $opt_cmd" log " basename = $basename" log " first job = $cmd_first" -log " options = $opt_option" +log " options = $opt_options" + +### redirect egs-parallel log file to application directory +logfile=$egs_home/$cmd_app/$basename.egsparallel +/bin/mv egs-parallel-$$.log $logfile +exec 3>>$logfile +log "egs-parallel log: $logfile" ### go to egs application directory log "cd $egs_home/$cmd_app" cd $egs_home/$cmd_app +### log script specific for this batch system +log "EXEC egs-parallel-$opt_batch $opt_queue $opt_ncore $opt_delay $cmd_first $basename '$opt_cmd' '$opt_options' $verbosity" + ### exec script specific for this batch system -log "EXEC egs-parallel-$opt_batch $opt_queue $opt_ncore $opt_delay $cmd_first $basename '$opt_cmd' '$opt_option'" -exec $batch_script $opt_queue $opt_ncore $opt_delay $cmd_first $basename "$opt_cmd" "$opt_option" \ No newline at end of file +exec $batch_script $opt_queue $opt_ncore $opt_delay $cmd_first $basename $opt_silent "$opt_cmd" "$opt_options" $verbosity \ No newline at end of file From 996919b5eb2157fcde407279d375cbd216cebfa9 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Tue, 25 Aug 2020 17:58:25 -0400 Subject: [PATCH 05/21] Improve egs-parallel sub-scripts Apart from format and other minor adjustments, update the standard pbs script egs-parallel-pbs (whereby EGSnrc jobs are submitted individually) so that only the second job waits for the .egsjob file and the .lock file, since the jobs are submitted sequentially anyways. 
--- HEN_HOUSE/scripts/egs-parallel-dshtask | 34 ++++--- HEN_HOUSE/scripts/egs-parallel-pbs | 124 +++++++++++++------------ HEN_HOUSE/scripts/egs-parallel-pbsdsh | 17 ++-- 3 files changed, 98 insertions(+), 77 deletions(-) diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index 8eca7c174..4b5819418 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -23,13 +23,18 @@ function help { EOF } +### timestamp function +function timestamp { + printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" +} + ### log function to write messages to log file and standard output function log { - msg="EGSnrc $(date --rfc-3339=ns): $1\n" + msg="$(timestamp): $1\n" printf "$msg" } -### quit function +### quit function for errors, with source, line, message and command function quit { lineno=$1 msg=$2 @@ -42,17 +47,19 @@ function quit { ### quit function if simulation is done function quit_if_done { - done=$(grep -o END $basename.egsjob) - if [ "$done" = "END" ]; then - log "$jobstr: QUIT (simulation already finished)" - exit + if [ -r $basename.egsjob ]; then + done=$(grep -o END $basename.egsjob) + if [ "$done" = "END" ]; then + log "$jobstr: QUIT (simulation already finished)" + exit + fi fi } ### go to pbs working directory cd $PBS_O_WORKDIR -### parse arguments +### parse command-line arguments (simplistic) args_min=6 if [ "$#" -lt $args_min ]; then quit $LINENO "only $# arguments provided; at least $args_min required" help @@ -69,7 +76,7 @@ task=${PBS_TASKNUM} taskstr="task $task" prefix="$pbsdsh_dir/${basename}_" taskfile=${prefix}${task}.task -log "$taskstr: $HOSTNAME: $taskfile" +log "$taskstr: host $HOSTNAME: $taskfile" touch $taskfile ### wait until all tasks have launched @@ -114,7 +121,7 @@ log "$jobstr: host=$(hostname) BEGIN pid=$$" if [ $job -eq 1 ]; then # log host and pid of job 1 in .egsjob file - log "$jobstr: host=$(hostname) BEGIN pid=$$" > $basename.egsjob + log "$jobstr: BEGIN 
host=$(hostname) pid=$$" > $basename.egsjob else @@ -191,14 +198,13 @@ fi ### launch the job if simulation is not done already quit_if_done source $HEN_HOUSE/scripts/egsnrc_bashrc_additions -log "$jobstr: RUN $command -b -P $ncore-j $job" -$command b -P $ncore -j $job +log "$jobstr: RUN $command -b -P $ncore -j $job -f $first" +$command -b -P $ncore -j $job -f $first ### report that this job is done -log "$jobstr: host=$(hostname) END pid=$$" +log "$jobstr: DONE." ### report that the simulation (job 1) is done if [ $job -eq 1 ]; then - log "$jobstr: host=$(hostname) END pid=$$" >> $basename.egsjob - log "$jobstr: FINISH simulation" + log "$jobstr: END host=$(hostname) pid=$$" >> $basename.egsjob fi \ No newline at end of file diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index 1c7e11d5e..f69983bbb 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -10,7 +10,7 @@ function help { $(basename $0) queue ncore delay first basename 'command' ['others'] [verbose] This script is not meant to be called directly, but rather via the egs-parallel script - with the batch option "-b pbsdsh" + with the batch option "-b pbs" arguments: @@ -26,16 +26,21 @@ function help { EOF } +### timestamp function +function timestamp { + printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" +} + ### log function to write messages to log file and standard output function log { - msg="EGSnrc egs-parallel $(date --rfc-3339=ns): $1\n" + msg="$(timestamp): $1\n" printf "$msg" >&3 if [ "$verbosity" = "verbose" ]; then printf "$msg" fi } -### quit function +### quit function for errors, with source, line, message and command function quit { lineno=$1 msg=$2 @@ -92,61 +97,62 @@ for job in $(seq 1 $ncore); do # job label jobstr=$(printf "job %04d" $job) - ### manage jobs to avoid bottleneck and race conditions - if [ $job -gt 1 ]; then + # job 2 does all the waiting + if [ $job -eq 2 ]; then - # job 2 does all the waiting + # 
wait a fixed delay (relative to first job) delta=2 - if [ $job -eq 2 ]; then - - # wait a fixed delay (relative to first job) - log "$jobstr: wait $delta seconds (initial delay)" + log "$jobstr: wait $delta seconds (initial delay)" + sleep $delta + + # wait until there is an .egsjob file (maximum 120 seconds) + total=0 + delta=10 + limit=120 + while [ ! -e $basename.egsjob ]; do + log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" sleep $delta + total=$((total+$delta)) + if [ $total -gt $limit ]; then + log "$jobstr: QUIT (no $basename.egsjob file after $limit seconds)" + exit + fi + done - # wait until there is an .egsjob file (maximum 120 seconds) - total=0 - delta=10 - limit=120 - while [ ! -e $basename.egsjob ]; do - log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" - sleep $delta - total=$((total+$delta)) - if [ $total -gt $limit ]; then - log "$jobstr: QUIT (no $basename.egsjob file after $limit seconds)" - exit - fi - done + # quit if simulation is already done + quit_if_done + + # sleep until there is a lock file, maximum 300 seconds + total=0 + delta=10 + limit=300 + while [ ! -e $basename.lock ]; do # quit if simulation is already done quit_if_done - # sleep until there is a lock file, maximum 300 seconds - total=0 - delta=10 - limit=300 - while [ ! 
-e $basename.lock ]; do - - # quit if simulation is already done - quit_if_done - - # otherwise wait for lock file - log "$jobstr: wait $delta seconds ($basename.lock not found after $total seconds)" - sleep $delta - total=$((total+$delta)) - if [ $total -gt $limit ]; then - log "$jobstr: QUIT ($basename.lock not found after $limit seconds)" - exit - fi - done - fi + # otherwise wait for lock file + log "$jobstr: wait $delta seconds ($basename.lock not found after $total seconds)" + sleep $delta + total=$((total+$delta)) + if [ $total -gt $limit ]; then + log "$jobstr: QUIT ($basename.lock not found after $limit seconds)" + exit + fi + done + fi + + ### manage jobs to avoid bottleneck and race conditions + if [ $job -gt 1 ]; then # quit if simulation is already done quit_if_done # offset all jobs by a fixed delay (relative to previous job) - delta=500000 + delta=250000 log "$jobstr: wait $delta microseconds (default job offset delay)" usleep $delta + quit_if_done # extra user-specified delay between each job delta=$delay @@ -154,6 +160,7 @@ for job in $(seq 1 $ncore); do log "$jobstr: wait $delta seconds (user job offset delay)" sleep $delta fi + quit_if_done # report on lock file content if [ -r $basename.lock ]; then @@ -163,13 +170,14 @@ for job in $(seq 1 $ncore); do log "$jobstr: QUIT ($basename.lock does not exist or is not readable)" exit fi + quit_if_done fi - ### launch the job if simulation is not done already - quit_if_done - log "$jobstr: RUN $command -b -P $ncore -j $job" + ### launch the job if simulation + runcommand="$command -b -P $ncore -j $job -f $first" + log "$jobstr: SUBMIT $runcommand" source $HEN_HOUSE/scripts/egsnrc_bashrc_additions - jobid=$(qsub -q $queue $scheduler_options < \$PBS_O_WORKDIR/$basename.egsjob + log "$jobstr: BEGIN host=\$(hostname) pid=\$pid" > \$PBS_O_WORKDIR/$basename.egsjob fi +log "$jobstr: host=\$(hostname) pid=\$pid" +log "$jobstr: RUN $runcommand" -### run command -$command -b -P $ncore -j $job - -### log completion 
-log "$jobstr: PBS host=\$(hostname) END pid=\$\$" +### wait for completion and log +wait +log "$jobstr: DONE." if [ $job -eq 1 ]; then - log "$jobstr: host=\$(hostname) END pid=\$\$" >> \$PBS_O_WORKDIR/$basename.egsjob - log "$jobstr: FINISH simulation" + log "$jobstr: END host=\$(hostname) pid=\$pid" >> \$PBS_O_WORKDIR/$basename.egsjob fi EOF ) - echo $jobid + echo $jobpid done diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index c4a30edf3..a61dbb1d0 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -26,16 +26,21 @@ function help { EOF } +### timestamp function +function timestamp { + printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" +} + ### log function to write messages to log file and standard output function log { - msg="EGSnrc $(date --rfc-3339=ns): $1\n" + msg="$(timestamp): $1\n" printf "$msg" >&3 if [ "$verbosity" = "verbose" ]; then printf "$msg" fi } -### quit function +### quit function for errors, with source, line, message and command function quit { lineno=$1 msg=$2 @@ -83,12 +88,12 @@ if [ -e $pbsdsh_dir ]; then /bin/rm -r $pbsdsh_dir fi err=$(mkdir $pbsdsh_dir 2>&1) -if ! [[ -z $err ]]; then +if ! [ -z $err ]; then quit $LINENO "$err" fi ### launch pbsdsh tasks task_script=$HEN_HOUSE/scripts/egs-parallel-dshtask -jobid=$(qsub -q $queue $scheduler_options < Date: Tue, 25 Aug 2020 18:03:04 -0400 Subject: [PATCH 06/21] Add an egs-parallel sub-script for multicore cpus This egs-parallel-cpu subscript provides the option "--batch cpu" to egs-parallel, to launch a simulation on multiple cores on the local cpu, without requiring a job scheduler. Intentionally, this script is simple: it just launches the jobs sequentially, without waiting around for the .egsjob or .lock files, as in the pbs scripts. 
However, the logging is consistent with the other egs-parallel scripts The number of threads is always constrained to the number of threads available on the machine, because it is inefficient to go beyond that, and launching a large number of threads on a cpu by mistake may well stall the computer. --- HEN_HOUSE/scripts/egs-parallel-cpu | 110 +++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100755 HEN_HOUSE/scripts/egs-parallel-cpu diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu new file mode 100755 index 000000000..908c3de6b --- /dev/null +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -0,0 +1,110 @@ +#!/bin/bash + +### help function +function help { + log "HELP" + cat <&3 + if [ "$verbosity" = "verbose" ]; then + printf "$msg" + fi +} + +### quit function for errors, with source, line, message and command +function quit { + lineno=$1 + msg=$2 + case $3 in + help) cmd="help";; + *) cmd="";; + esac + verbosity="verbose" + log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 +} + +### begin script +log "BEGIN $0" + +### parse command-line arguments (simplistic) +args_min=6 +if [ "$#" -lt $args_min ]; then + quit $LINENO "only $# arguments provided; at least $args_min required" help +fi +queue=$1 +ncore=$2 +delay=$3 +first=$4 +basename=$5 +command=$6 +scheduler_options=$7 +verbosity=$8 + +### link file descriptor 3 to egs-parallel log file +exec 3>>$basename.egsparallel + +### begin script +log "BEGIN $0" + +### restrict number of jobs to the number of cpu threads +cpu_nthread=$(grep -c processor /proc/cpuinfo) +if [ $ncore -gt $cpu_nthread ]; then + log "reduce requested threads ($ncore) to match available cpu threads ($cpu_nthread)" + ncore=$cpu_nthread +fi + +### loop to launch ncore jobs on cpu +for job in $(seq 1 $ncore); do + + # job label + jobstr=$(printf "job %04d" $job) + + ### launch the job + runcommand="$command -b -P $ncore -j $job -f $first" + source 
$HEN_HOUSE/scripts/egsnrc_bashrc_additions + $runcommand >/dev/null 2>&1 & + pid=$(printf $!) + + ### update .egsjob file + if [ $job -eq 1 ]; then + printf "$(timestamp): $jobstr: BEGIN host=\$(hostname) pid=$$\n" > $basename.egsjob + log "BEGIN host=$(hostname)" + fi + log "$jobstr: RUN $runcommand" + log "$jobstr: host=$(hostname) pid=$pid" + +done + +# wait for all simulations to finish +wait +printf "$(timestamp): $jobstr: END host=\$(hostname) pid=$$\n" >> $basename.egsjob +log "DONE." From bc4ebc7b854576c4cdcc354f39301ec0d42bd231 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Tue, 25 Aug 2020 18:16:34 -0400 Subject: [PATCH 07/21] Overhaul script to tidy up after egs-parallel runs Improve the script robustness, in particular by forcing the user to specify either the -n (--dry-run) option, or the -f (--force) option to actually remove files, to prevent accidental erasing (to some extent). This script removes files without warnings (when using -f), so use with caution: run with the -n option first to see what will be deleted. Add the concatenation and sorting of egs-parallel log messages into the .egsparallel file for reference. This is useful, because these log messages may be scattered in different files, for example the .eo files from pbs. After cleaning, the .egsparallel contains a time-ordered sequence of messages from egs-parallel and its subscripts. --- HEN_HOUSE/scripts/egs-parallel-clean | 154 ++++++++++++++++++++++----- 1 file changed, 130 insertions(+), 24 deletions(-) diff --git a/HEN_HOUSE/scripts/egs-parallel-clean b/HEN_HOUSE/scripts/egs-parallel-clean index 8c6e8d9ab..f7ac3fa6e 100755 --- a/HEN_HOUSE/scripts/egs-parallel-clean +++ b/HEN_HOUSE/scripts/egs-parallel-clean @@ -1,50 +1,156 @@ #!/bin/bash -# usage display -usage() { - echo -e "\nPlease provide at least one simulation basename." - echo -e "\nUSAGE:\n$(basename $0) [ basename1 basename2 ... 
]\n" +### help function +function help { + cat </dev/null) + if [ -n "$eofiles" ]; then + dirty="yes" + log "join egs-parallel log files into $f.egsparallel" + if [ "$opt_delete" = "yes" ]; then + cat $f.egsparallel $eofiles 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel.combined + /bin/mv $f.egsparallel.combined $f.egsparallel + fi + fi + done # loop over file extensions - for ext in lock eo mederr egsjob; do + for ext in $clean_ext; do # remove files - count=$( find -maxdepth 1 -name "${name}.${ext}" | wc -l ) + count=$( find -maxdepth 1 -name "${name}${ext}" | wc -l ) if [ $count -gt 0 ]; then - for f in ${name}.${ext}; do - (set -x; /bin/rm $f) + dirty="yes" + for f in ${name}${ext}; do + list_or_delete "$f" done fi done - # remove work files count=$( find -maxdepth 1 -name "${name}_w*" | wc -l ) if [ $count -gt 0 ]; then - echo "+ /bin/rm ${name}_w*" - /bin/rm ${name}_w* + dirty="yes" + list_or_delete "${name}_w*" fi # remove egsrun directories count=$( find -maxdepth 1 -name "egsrun_*_${name}_*" | wc -l ) if [ $count -gt 0 ]; then - echo "+ /bin/rm egsrun_*_${name}_*" - /bin/rm -r egsrun_*_${name}_* + dirty="yes" + list_or_delete "egsrun_*_${name}_*" fi -done + if [ "$dirty" = "no" ]; then + log "(nothing to clean)" + fi + printf "" + +done \ No newline at end of file From e89a4a1ee4e79e8660a0446e11c23d1c142506e5 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Tue, 25 Aug 2020 18:44:57 -0400 Subject: [PATCH 08/21] Change ncore to nthread in egs-parallel scripts Strictly speaking, there can be multiple threads per hardware core; this is typical in modern workstations. Change "ncore" to "nthread" throughout the egs-parallel scripts, to avoid confusion.
--- HEN_HOUSE/scripts/egs-parallel | 18 +++++++++--------- HEN_HOUSE/scripts/egs-parallel-cpu | 18 +++++++++--------- HEN_HOUSE/scripts/egs-parallel-dshtask | 16 ++++++++-------- HEN_HOUSE/scripts/egs-parallel-pbs | 16 ++++++++-------- HEN_HOUSE/scripts/egs-parallel-pbsdsh | 14 +++++++------- 5 files changed, 41 insertions(+), 41 deletions(-) diff --git a/HEN_HOUSE/scripts/egs-parallel b/HEN_HOUSE/scripts/egs-parallel index ea1442461..b95faa5e1 100755 --- a/HEN_HOUSE/scripts/egs-parallel +++ b/HEN_HOUSE/scripts/egs-parallel @@ -15,7 +15,7 @@ function help { -b | --batch batch system to use ("pbsdsh" by default) -d | --delay delay in seconds between individual jobs -q | --queue scheduler queue ("long" by default) - -n | --ncore number of cores ("8" by default) + -n | --nthread number of threads ("8" by default) -o | --option option(s) to pass to job scheduler, in quotes -v | --verbose echo detailed egs-parallel log messages to terminal -c | --command command to run, given in quotes @@ -55,7 +55,7 @@ exec 3>egs-parallel-$$.log ### default option values opt_batch="pbsdsh" opt_queue="long" -opt_ncore="8" +opt_nthread="8" opt_delay="0" opt_command="" opt_options="" @@ -86,7 +86,7 @@ while [ "$#" -gt 0 ]; do case $opt in -b|--batch) opt_batch="$arg";; -q|--queue) opt_queue="$arg";; - -n|--ncore) opt_ncore="$arg";; + -n|--nthread) opt_nthread="$arg";; -d|--delay) opt_delay="$arg";; -c|--command) opt_cmd="$arg";; -o|--option) opt_options_array+=("$arg"); arg="parsed";; @@ -115,9 +115,9 @@ if ! [ -x "$(command -v ${batch_script})" ]; then quit $LINENO "batch script not found: ${batch_script}" fi -### check that ncore is an integer -if ! [[ "$opt_ncore" =~ ^[0-9]+$ ]] ; then - quit $LINENO "number of cores (-n option) is not an integer: $opt_ncore" +### check that nthread is an integer +if ! 
[[ "$opt_nthread" =~ ^[0-9]+$ ]] ; then + quit $LINENO "number of threads (-n option) is not an integer: $opt_nthread" fi ### check that delay is an integer @@ -179,7 +179,7 @@ fi log "parallel options:" log " batch = $opt_batch" log " queue = $opt_queue" -log " ncore = $opt_ncore" +log " nthread = $opt_nthread" log " delay = $opt_delay" log " command = $opt_cmd" log " basename = $basename" @@ -197,7 +197,7 @@ log "cd $egs_home/$cmd_app" cd $egs_home/$cmd_app ### log script specific for this batch system -log "EXEC egs-parallel-$opt_batch $opt_queue $opt_ncore $opt_delay $cmd_first $basename '$opt_cmd' '$opt_options' $verbosity" +log "EXEC egs-parallel-$opt_batch $opt_queue $opt_nthread $opt_delay $cmd_first $basename '$opt_cmd' '$opt_options' $verbosity" ### exec script specific for this batch system -exec $batch_script $opt_queue $opt_ncore $opt_delay $cmd_first $basename $opt_silent "$opt_cmd" "$opt_options" $verbosity \ No newline at end of file +exec $batch_script $opt_queue $opt_nthread $opt_delay $cmd_first $basename $opt_silent "$opt_cmd" "$opt_options" $verbosity \ No newline at end of file diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index 908c3de6b..c9edb5281 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -7,7 +7,7 @@ function help { usage: - $(basename $0) queue ncore delay first basename 'command' ['others'] [verbose] + $(basename $0) queue nthread delay first basename 'command' ['others'] [verbose] This script is not meant to be called directly, but rather via the egs-parallel script with the batch option "-b cpu" @@ -15,7 +15,7 @@ function help { arguments: queue queue name on the pbs scheduler - ncore number of computing cores to use + nthread number of threads to use (number of jobs; at most available cpu threads) delay delay in seconds between individual jobs basename simulation base name (input file name, without ".egsinp" extension) first first job index @@ 
-61,7 +61,7 @@ if [ "$#" -lt $args_min ]; then quit $LINENO "only $# arguments provided; at least $args_min required" help fi queue=$1 -ncore=$2 +nthread=$2 delay=$3 first=$4 basename=$5 @@ -77,19 +77,19 @@ log "BEGIN $0" ### restrict number of jobs to the number of cpu threads cpu_nthread=$(grep -c processor /proc/cpuinfo) -if [ $ncore -gt $cpu_nthread ]; then - log "reduce requested threads ($ncore) to match available cpu threads ($cpu_nthread)" - ncore=$cpu_nthread +if [ $nthread -gt $cpu_nthread ]; then + log "reduce requested threads ($nthread) to match available cpu threads ($cpu_nthread)" + nthread=$cpu_nthread fi -### loop to launch ncore jobs on cpu -for job in $(seq 1 $ncore); do +### loop to launch nthread jobs on cpu +for job in $(seq 1 $nthread); do # job label jobstr=$(printf "job %04d" $job) ### launch the job - runcommand="$command -b -P $ncore -j $job -f $first" + runcommand="$command -b -P $nthread -j $job -f $first" source $HEN_HOUSE/scripts/egsnrc_bashrc_additions $runcommand >/dev/null 2>&1 & pid=$(printf $!) 
diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index 4b5819418..ba9300306 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -7,7 +7,7 @@ function help { usage: - $(basename $0) pbsdsh_dir basename ncore first delay 'command' + $(basename $0) pbsdsh_dir basename nthread first delay 'command' This script is not meant to be called directly, but rather from the egs-parallel-pbsdsh script @@ -15,7 +15,7 @@ function help { pbsdsh_dir existing directory where to store task files for job numbers basename simulation base name (input file name, without ".egsinp" extension) - ncore number of task to run under dsh + nthread number of task to run under dsh first first job index delay user-specified delay in seconds between individual jobs command command to run, in quotes @@ -66,7 +66,7 @@ if [ "$#" -lt $args_min ]; then fi pbsdsh_dir=$1 basename=$2 -ncore=$3 +nthread=$3 first=$4 delay=$5 command=$6 @@ -82,8 +82,8 @@ touch $taskfile ### wait until all tasks have launched delta=2 filecount=$(ls -Ub1 -- $prefix*.task | wc -l) -while [ $filecount -lt $ncore ]; do - log "$taskstr: wait $delta seconds for all tasks to start ($filecount/$ncore)" +while [ $filecount -lt $nthread ]; do + log "$taskstr: wait $delta seconds for all tasks to start ($filecount/$nthread)" sleep $delta filecount=$(ls -Ub1 -- $prefix*.task | wc -l) done @@ -94,7 +94,7 @@ manager=$(basename ${manager#${prefix}} .task) ### manager assigns task indices if [ $manager == $task ]; then - log "$taskstr: manager: assigning $ncore tasks" + log "$taskstr: manager: assigning $nthread tasks" job=$first for f in $(ls ${prefix}*.task); do printf "$job\n" > $f @@ -198,8 +198,8 @@ fi ### launch the job if simulation is not done already quit_if_done source $HEN_HOUSE/scripts/egsnrc_bashrc_additions -log "$jobstr: RUN $command -b -P $ncore -j $job -f $first" -$command -b -P $ncore -j $job -f $first +log "$jobstr: RUN $command -b -P 
$nthread -j $job -f $first" +$command -b -P $nthread -j $job -f $first ### report that this job is done log "$jobstr: DONE." diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index f69983bbb..bb750dd96 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -7,7 +7,7 @@ function help { usage: - $(basename $0) queue ncore delay first basename 'command' ['others'] [verbose] + $(basename $0) queue nthread delay first basename 'command' ['others'] [verbose] This script is not meant to be called directly, but rather via the egs-parallel script with the batch option "-b pbs" @@ -15,7 +15,7 @@ function help { arguments: queue queue name on the pbs scheduler - ncore number of computing cores to use + nthread number of computing threads to use (how many jobs) delay delay in seconds between individual jobs basename simulation base name (input file name, without ".egsinp" extension) first first job index @@ -72,7 +72,7 @@ if [ "$#" -lt $args_min ]; then quit $LINENO "only $# arguments provided; at least $args_min required" help fi queue=$1 -ncore=$2 +nthread=$2 delay=$3 first=$4 basename=$5 @@ -84,15 +84,15 @@ verbosity=$8 exec 3>>$basename.egsparallel ### set scheduler job name (maximum 14 characters) -jobname=$(echo ${basename}[$ncore]) +jobname=$(echo ${basename}[$nthread]) trim=$(( $(echo $jobname | wc -c) - 14 )) if [ $trim -gt 0 ]; then jobname=$(echo $jobname | cut -c $trim-) fi log "job name: $jobname" -### loop to launch ncore pbs jobs -for job in $(seq 1 $ncore); do +### loop to launch nthread pbs jobs +for job in $(seq 1 $nthread); do # job label jobstr=$(printf "job %04d" $job) @@ -173,8 +173,8 @@ for job in $(seq 1 $ncore); do quit_if_done fi - ### launch the job if simulation - runcommand="$command -b -P $ncore -j $job -f $first" + ### launch the job + runcommand="$command -b -P $nthread -j $job -f $first" log "$jobstr: SUBMIT $runcommand" source $HEN_HOUSE/scripts/egsnrc_bashrc_additions 
jobpid=$(qsub -q $queue $scheduler_options <>$basename.egsparallel log "BEGIN $0" ### set scheduler job name (maximum 14 characters) -jobname=$(echo ${basename}[$ncore]) +jobname=$(echo ${basename}[$nthread]) trim=$(( $(echo $jobname | wc -c) - 14 )) if [ $trim -gt 0 ]; then jobname=$(echo $jobname | cut -c $trim-) @@ -98,12 +98,12 @@ jobpid=$(qsub -q $queue $scheduler_options < Date: Wed, 26 Aug 2020 09:44:47 -0400 Subject: [PATCH 09/21] Add HEN_HOUSE/scripts/bin directory, add to path Add a bin directory in HEN_HOUSE/scripts and add it to the PATH in the shell additions scripts. This allows some EGSnrc scripts to be directly executable by a user, without using aliases (which are not inherited by subshells). The immediate motivation is for the top-level egs-parallel script, and the egs-parallel-clean script, to become visible on the path, while the egs-parallel sub-scripts remain in scripts and are not in the path (these should not be invoked directly). --- HEN_HOUSE/scripts/{ => bin}/egs-parallel | 0 HEN_HOUSE/scripts/{ => bin}/egs-parallel-clean | 0 HEN_HOUSE/scripts/egsnrc_bashrc_additions | 3 ++- HEN_HOUSE/scripts/egsnrc_cshrc_additions | 1 + HEN_HOUSE/scripts/egsnrc_fishrc_additions | 4 ++-- 5 files changed, 5 insertions(+), 3 deletions(-) rename HEN_HOUSE/scripts/{ => bin}/egs-parallel (100%) rename HEN_HOUSE/scripts/{ => bin}/egs-parallel-clean (100%) diff --git a/HEN_HOUSE/scripts/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel similarity index 100% rename from HEN_HOUSE/scripts/egs-parallel rename to HEN_HOUSE/scripts/bin/egs-parallel diff --git a/HEN_HOUSE/scripts/egs-parallel-clean b/HEN_HOUSE/scripts/bin/egs-parallel-clean similarity index 100% rename from HEN_HOUSE/scripts/egs-parallel-clean rename to HEN_HOUSE/scripts/bin/egs-parallel-clean diff --git a/HEN_HOUSE/scripts/egsnrc_bashrc_additions b/HEN_HOUSE/scripts/egsnrc_bashrc_additions index d24f53630..45093dea2 100644 --- a/HEN_HOUSE/scripts/egsnrc_bashrc_additions +++ 
b/HEN_HOUSE/scripts/egsnrc_bashrc_additions @@ -44,8 +44,9 @@ my_machine=`cat $EGS_CONFIG | grep "my_machine =" | sed "s/my_machine = //"` HEN_HOUSE=`cat $EGS_CONFIG | grep "HEN_HOUSE =" | sed "s/HEN_HOUSE = //"` export OMEGA_HOME=${HEN_HOUSE}omega -# Add HEN_HOUSE bin area to the path +# Add HEN_HOUSE bin and scripts/bin to the path # +PATH="${HEN_HOUSE}scripts/bin:$PATH" PATH="${HEN_HOUSE}bin/$my_machine:$PATH" # Now check for EGS_HOME. diff --git a/HEN_HOUSE/scripts/egsnrc_cshrc_additions b/HEN_HOUSE/scripts/egsnrc_cshrc_additions index 4e9c17406..65f2db0f7 100644 --- a/HEN_HOUSE/scripts/egsnrc_cshrc_additions +++ b/HEN_HOUSE/scripts/egsnrc_cshrc_additions @@ -46,6 +46,7 @@ setenv OMEGA_HOME $HEN_HOUSE/omega # Path # +setenv PATH "${HEN_HOUSE}scripts/bin:${PATH}" setenv PATH "${HEN_HOUSE}bin/${my_machine}:${PATH}" if (-d ${EGS_HOME}bin/$my_machine) then setenv PATH "${EGS_HOME}bin/${my_machine}:${PATH}" diff --git a/HEN_HOUSE/scripts/egsnrc_fishrc_additions b/HEN_HOUSE/scripts/egsnrc_fishrc_additions index c816b308b..833ff5c3c 100644 --- a/HEN_HOUSE/scripts/egsnrc_fishrc_additions +++ b/HEN_HOUSE/scripts/egsnrc_fishrc_additions @@ -46,9 +46,9 @@ set -x my_machine (cat $EGS_CONFIG | grep "my_machine =" | sed "s/my_machine = / set -x HEN_HOUSE (cat $EGS_CONFIG | grep "HEN_HOUSE =" | sed "s/HEN_HOUSE = //") set -x OMEGA_HOME {$HEN_HOUSE}omega -# Add HEN_HOUSE bin area to the path +# Add HEN_HOUSE bin and scripts/bin to the path # -set -x PATH $PATH {$HEN_HOUSE}bin/$my_machine +set -x PATH $PATH {$HEN_HOUSE}bin/$my_machine {$HEN_HOUSE}scripts/bin # Now check for EGS_HOME. # If EGS_HOME is not empty and exists, check for $EGS_HOME/bin and From 73cead249ab5007f3c77f4f3c11bd42e2964467b Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Wed, 26 Aug 2020 09:53:34 -0400 Subject: [PATCH 10/21] Remove shell additions sourcing in egs-parallel Do not source the shell additions scripts from within the egs-parallel sub-scripts, as this is not necessary and not secure. 
Sourcing was only needed in the dshtask script to get the path to the EGSnrc executables, because tasks are launched on the pbs nodes without inheriting the environment. In this case, simply export the PATH variable via the pbsdsh qsub script. --- HEN_HOUSE/scripts/egs-parallel-cpu | 1 - HEN_HOUSE/scripts/egs-parallel-dshtask | 1 - HEN_HOUSE/scripts/egs-parallel-pbs | 1 - HEN_HOUSE/scripts/egs-parallel-pbsdsh | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index c9edb5281..5ce9dd1d0 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -90,7 +90,6 @@ for job in $(seq 1 $nthread); do ### launch the job runcommand="$command -b -P $nthread -j $job -f $first" - source $HEN_HOUSE/scripts/egsnrc_bashrc_additions $runcommand >/dev/null 2>&1 & pid=$(printf $!) diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index ba9300306..bf0ef8e7d 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -197,7 +197,6 @@ fi ### launch the job if simulation is not done already quit_if_done -source $HEN_HOUSE/scripts/egsnrc_bashrc_additions log "$jobstr: RUN $command -b -P $nthread -j $job -f $first" $command -b -P $nthread -j $job -f $first diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index bb750dd96..d743db394 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -176,7 +176,6 @@ for job in $(seq 1 $nthread); do ### launch the job runcommand="$command -b -P $nthread -j $job -f $first" log "$jobstr: SUBMIT $runcommand" - source $HEN_HOUSE/scripts/egsnrc_bashrc_additions jobpid=$(qsub -q $queue $scheduler_options < Date: Wed, 26 Aug 2020 12:01:46 -0400 Subject: [PATCH 11/21] Tweak timestamp and usage in egs-parallel scripts Use a more portable date command format for the timestamp string, and tweak
the usage message of egs-parallel scripts. --- HEN_HOUSE/scripts/bin/egs-parallel | 4 ++-- HEN_HOUSE/scripts/egs-parallel-cpu | 14 ++++++++------ HEN_HOUSE/scripts/egs-parallel-dshtask | 13 ++++++++----- HEN_HOUSE/scripts/egs-parallel-pbs | 14 ++++++++------ HEN_HOUSE/scripts/egs-parallel-pbsdsh | 12 +++++++----- 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel index b95faa5e1..858cb7689 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel +++ b/HEN_HOUSE/scripts/bin/egs-parallel @@ -23,9 +23,9 @@ function help { EOF } -### timestamp function +### timestamp functionq function timestamp { - printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" + printf "EGSnrc egs-parallel $(date -u "+%Y-%m-%d (UTC) %H:%M:%S.%N")" } ### log function to write messages to log file and standard output diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index 5ce9dd1d0..e89c4bd34 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -9,26 +9,28 @@ function help { $(basename $0) queue nthread delay first basename 'command' ['others'] [verbose] - This script is not meant to be called directly, but rather via the egs-parallel script - with the batch option "-b cpu" - arguments: queue queue name on the pbs scheduler - nthread number of threads to use (number of jobs; at most available cpu threads) + nthread number of threads to use (number of jobs) delay delay in seconds between individual jobs - basename simulation base name (input file name, without ".egsinp" extension) first first job index + basename simulation input file name, without ".egsinp" extension command command to run, in quotes others other options passed to scheduler, in quotes verbose echo detailed egs-parallel log messages to terminal + note: + + This script is not meant to be called directly, but rather via the + egs-parallel script with the batch option "-b cpu" + EOF } ### 
timestamp function function timestamp { - printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" + printf "EGSnrc egs-parallel $(date -u "+%Y-%m-%d (UTC) %H:%M:%S.%N")" } ### log function to write messages to log file and standard output diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index bf0ef8e7d..1de8af513 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -9,23 +9,26 @@ function help { $(basename $0) pbsdsh_dir basename nthread first delay 'command' - This script is not meant to be called directly, but rather from the egs-parallel-pbsdsh script - arguments: - pbsdsh_dir existing directory where to store task files for job numbers - basename simulation base name (input file name, without ".egsinp" extension) + pbsdsh_dir existing directory to save task files for job numbers + basename simulation input file name, without ".egsinp" extension nthread number of task to run under dsh first first job index delay user-specified delay in seconds between individual jobs command command to run, in quotes + note: + + This script is not meant to be called directly, but rather from the + egs-parallel-pbsdsh script + EOF } ### timestamp function function timestamp { - printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" + printf "EGSnrc egs-parallel $(date -u "+%Y-%m-%d (UTC) %H:%M:%S.%N")" } ### log function to write messages to log file and standard output diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index d743db394..eb2a0224f 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -9,26 +9,28 @@ function help { $(basename $0) queue nthread delay first basename 'command' ['others'] [verbose] - This script is not meant to be called directly, but rather via the egs-parallel script - with the batch option "-b pbs" - arguments: queue queue name on the pbs scheduler - nthread number of computing threads to use (how 
many jobs) + nthread number of threads to use (number of jobs) delay delay in seconds between individual jobs - basename simulation base name (input file name, without ".egsinp" extension) first first job index + basename simulation input file name, without ".egsinp" extension command command to run, in quotes others other options passed to scheduler, in quotes verbose echo detailed egs-parallel log messages to terminal + note: + + This script is not meant to be called directly, but rather via the + egs-parallel script with the batch option "-b pbs" + EOF } ### timestamp function function timestamp { - printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" + printf "EGSnrc egs-parallel $(date -u "+%Y-%m-%d (UTC) %H:%M:%S.%N")" } ### log function to write messages to log file and standard output diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index 2a7ae75fd..bec77b94f 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -9,26 +9,28 @@ function help { $(basename $0) queue nthread delay first basename 'command' ['others'] [verbose] - This script is not meant to be called directly, but rather via the egs-parallel script - with the batch option "-b pbsdsh" - arguments: queue queue name on the pbs scheduler nthread number of threads to use (number of jobs) delay delay in seconds between individual jobs first first job index - basename simulation base name (input file name, without ".egsinp" extension) + basename simulation input file name, without ".egsinp" extension command command to run, in quotes others other options passed to scheduler, in quotes verbose echo detailed egs-parallel log messages to terminal + note: + + This script is not meant to be called directly, but rather via the + egs-parallel script with the batch option "-b pbsdsh" + EOF } ### timestamp function function timestamp { - printf "EGSnrc egs-parallel $(date --rfc-3339=ns)" + printf "EGSnrc egs-parallel $(date -u 
"+%Y-%m-%d (UTC) %H:%M:%S.%N")" } ### log function to write messages to log file and standard output From 19393ee3ffb0e3e6d6be1a0c4fe4113a49bc09d4 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Thu, 27 Aug 2020 08:40:19 -0400 Subject: [PATCH 12/21] Add -x and -v options to egs-parallel-clean Add -x (--extra) option to clean up egs-parallel log files .egsparallel and .egsparallel.eo. Although this script always echoes progress to the terminal, add a -v (--verbose) option to echo the commands that are run by the script, instead of the more concise messages usually reported. Internally, add an "action" command to ensure that the log messages remain up to date with the commands. --- HEN_HOUSE/scripts/bin/egs-parallel-clean | 138 ++++++++++++++--------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel-clean b/HEN_HOUSE/scripts/bin/egs-parallel-clean index f7ac3fa6e..70e87f55d 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel-clean +++ b/HEN_HOUSE/scripts/bin/egs-parallel-clean @@ -6,19 +6,23 @@ function help { usage: - $(basename $0) [ -f | -n ] basename [ basename2 ... ] + $(basename $0) [options] { -f | -n } basename [ basename2 ... ] With -f flag, remove temporary files created by EGSnrc simulations for input files basename1.egsinp basename2.egsinp, etc. Run with -n flag to check - which files would be deleted; The list of basenames can also be given as a - glob pattern given in double quotes, as in $(basename $0) "myinput*", for example. - Also sorts and catenates output from job scheduler into the basename.egsparallel - log file. + which files would be deleted. Unless the -x option is given, also sort and merge + the output from egs-parallel output into a .egsparallel log file, and merge + the outputs from all the scheduler .eo files into a .egsparallel.eo file + + Note that the list of basenames can also be given as a glob pattern within quotes, + as in $(basename $0) "myinput*", for example. 
options and arguments: -f | --force remove temporary files (using /bin/rm -rf) -n | --dry-run show which files would be removed when using -f flag (default) + -x | --extra extra clean: also remove .egsparallel and .egsparallel.eo log files + -v | --verbose log the actual commands executed, instead of concise log messages basename simulation base name (input file name, with or without ".egsinp" extension) EOF @@ -41,24 +45,43 @@ function quit { } ### list or delete the file passed as the first argument -function list_or_delete { - slash="" - if [ -d "$1" ]; then - slash="/" - fi - if [ "$opt_delete" = "yes" ]; then - log "/bin/rm -r $1$slash" - /bin/rm -r $1 +function action { + + # process specific log message for the action + option_log=$1 + option_msg=$2 + if [ "$option_log" = "--log" ]; then + msg=$option_msg + if [ "$msg" = "--none" ]; then + msg="" + fi + shift 2; else - log "remove $1$slash" + msg="$@" fi -} -### begin script -log "$0" + # if verbose option is invoked, log the actual command that will be run + if [ "$opt_verbose" = "yes" ]; then + msg="$@" + fi + + # log action + if ! 
[ -z "$msg" ]; then + log "$msg" + fi + + # perform action if -f (--force) option is invoked + if [ "$opt_force" = "yes" ]; then + eval "$@" + fi + dirty="yes" + +} ### default option values -opt_delete="" +opt_force="" +opt_extra="no" +opt_verbose="no" declare -a opt_names ### parse command-line arguments (simplistic) @@ -66,91 +89,100 @@ while [ "$#" -gt 0 ]; do opt=$1; shift case $opt in -h|--help) help; exit;; - -f|--force) if [ -z $opt_delete ]; then - opt_delete="yes" + -f|--force) if [ -z "$opt_force" ]; then + opt_force="yes" fi;; - -n|--dry-run) opt_delete="no";; + -n|--dry-run) opt_force="no";; + -x|--extra) opt_extra="yes";; + -v|--verbose) opt_verbose="yes";; *) opt_names+=("${opt%.egsinp}");; esac done +### begin script +log "$(basename $0)" + ### ensure either -f or -n is specified -if [ -z "$opt_delete" ]; then +if [ -z "$opt_force" ]; then quit $LINENO "either option -f (--force) or -n (--dry-run) must be specified" help fi ### ensure there is at least one basename provided -if [ -z "${opt_names[@]}" ]; then +names="${opt_names[@]}" +if [ -z "$names" ]; then quit $LINENO "at least one simulation basename must be specified" help fi ### define filename extension to clean -clean_ext=".lock .mederr .egsjob .pbsdsh .eo .e .o" +clean_extensions=".lock .mederr .egsjob .pbsdsh .eo .e .o" ### loop over all names for name in "${opt_names[@]}"; do - # log name and indentation + ### log name and indentation tab="" dryrun="" - if [ "$opt_delete" = "no" ]; then + if [ "$opt_force" = "no" ]; then dryrun="(dry-run)" fi log "CLEANING ${name} ... 
$dryrun" tab=" " - # dirty flag and log tabulation + ### dirty flag and log tabulation dirty="no" - # log dry run - if [ "$opt_delete" = "no" ]; then + ### log dry run + if [ "$opt_force" = "no" ]; then log "with the -f (--force) option, cleaning would:" fi - # sort catenated egs-parallel log files into .egsparallel - for f in ${name}.egsjob; do - f=${f%.egsjob} - eofiles=($f.egsjob $f.o $f.eo ${f}_w*.o ${f}_w*.eo) - eofiles=$(ls echo ${eofiles[@]} 2>/dev/null) - if [ -n "$eofiles" ]; then - dirty="yes" - log "join egs-parallel log files into $f.egsparallel" - if [ "$opt_delete" = "yes" ]; then - cat $f.egsparallel $eofiles 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel.combined - /bin/mv $f.egsparallel.combined $f.egsparallel + ### clean .egsparallel and .eo log files if -x (--extra) is invoked, otherwise catenate them + if [ "$opt_extra" = "yes" ]; then + clean_extensions="$clean_extensions .egsparallel .egsparallel.eo" + else + for f in ${name}.egsparallel; do + f=${f%.egsparallel} + files=($f.egsjob $f.o $f.eo ${f}_w*.o ${f}_w*.eo) + lsfiles=$(/bin/ls -dx ${files[@]} 2>/dev/null) + if ! 
[ -z "$lsfiles" ]; then + action --log "merge $f pbs sterr and stdout files into $f.egsparallel.eo" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep -v 'EGSnrc egs-parallel' >$f.egsparallel.eo" + action --log "merge $f egs-parallel log files into $f.egsparallel" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel.merged" + action --log --none "mv $f.egsparallel.merged $f.egsparallel" fi - fi - done + done + fi - # loop over file extensions - for ext in $clean_ext; do + ### loop over file extensions + for ext in $clean_extensions; do # remove files count=$( find -maxdepth 1 -name "${name}${ext}" | wc -l ) if [ $count -gt 0 ]; then - dirty="yes" for f in ${name}${ext}; do - list_or_delete "$f" + if [ -d $f ]; then + f="${f%/}/" + fi + action --log "remove $f" "/bin/rm -r $f" done fi done - # remove work files + + ### remove work files count=$( find -maxdepth 1 -name "${name}_w*" | wc -l ) if [ $count -gt 0 ]; then - dirty="yes" - list_or_delete "${name}_w*" + action --log "remove ${name}_w*" "/bin/rm -r ${name}_w*" fi - # remove egsrun directories + ### remove egsrun directories count=$( find -maxdepth 1 -name "egsrun_*_${name}_*" | wc -l ) if [ $count -gt 0 ]; then - dirty="yes" - list_or_delete "egsrun_*_${name}_*" + action --log "remove egsrun_*_${name}_*/" "/bin/rm -r egsrun_*_${name}_*" fi + ### report if there is nothing to clean if [ "$dirty" = "no" ]; then log "(nothing to clean)" fi - printf "" + printf "\n" done \ No newline at end of file From 20fc34b507d735e3c3e131ed5bd4dd0087839664 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Thu, 27 Aug 2020 14:06:30 -0400 Subject: [PATCH 13/21] Add -l (--list) option to egs-parallel-clean For convenience, add a -l (--list) option to the cleaning script to list all the .egslog file base names in the current directory. This option is checked first and overrides every other argument: the list is printed to the terminal and the script terminates. 
Also, reformat the usage message and use the extension .egsparallel-eo (with a hyphen) to avoid collision with the pbs .eo extension. Use executable basename in quit function. --- HEN_HOUSE/scripts/bin/egs-parallel | 2 +- HEN_HOUSE/scripts/bin/egs-parallel-clean | 80 +++++++++++++++--------- HEN_HOUSE/scripts/egs-parallel-cpu | 2 +- HEN_HOUSE/scripts/egs-parallel-dshtask | 2 +- HEN_HOUSE/scripts/egs-parallel-pbs | 2 +- HEN_HOUSE/scripts/egs-parallel-pbsdsh | 2 +- 6 files changed, 55 insertions(+), 35 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel index 858cb7689..a7e98248b 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel +++ b/HEN_HOUSE/scripts/bin/egs-parallel @@ -46,7 +46,7 @@ function quit { *) cmd="";; esac verbosity="verbose" - log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 + log "$(basename $0): line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } ### link file descriptor 3 to egs-parallel log file diff --git a/HEN_HOUSE/scripts/bin/egs-parallel-clean b/HEN_HOUSE/scripts/bin/egs-parallel-clean index 70e87f55d..99789d9d7 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel-clean +++ b/HEN_HOUSE/scripts/bin/egs-parallel-clean @@ -8,22 +8,26 @@ function help { $(basename $0) [options] { -f | -n } basename [ basename2 ... ] - With -f flag, remove temporary files created by EGSnrc simulations for - input files basename1.egsinp basename2.egsinp, etc. Run with -n flag to check - which files would be deleted. Unless the -x option is given, also sort and merge - the output from egs-parallel output into a .egsparallel log file, and merge - the outputs from all the scheduler .eo files into a .egsparallel.eo file + options and arguments: - Note that the list of basenames can also be given as a glob pattern within quotes, - as in $(basename $0) "myinput*", for example. 
+ -l | --list only list the .egslog files in directory and exit + -f | --force remove temporary files (using /bin/rm -r) + -n | --dry-run show which files would be removed with -f flag + -x | --extra extra clean: also remove egs-parallel log files + -v | --verbose log the commands executed, instead of concise log messages + basename input file name, with or without the ".egsinp" extension - options and arguments: + description: - -f | --force remove temporary files (using /bin/rm -rf) - -n | --dry-run show which files would be removed when using -f flag (default) - -x | --extra extra clean: also remove .egsparallel and .egsparallel.eo log files - -v | --verbose log the actual commands executed, instead of concise log messages - basename simulation base name (input file name, with or without ".egsinp" extension) + With -f flag, remove temporary files created by EGSnrc simulations for + input files basename.egsinp basename2.egsinp, etc. Run with -n flag to + check which files would be deleted. Unless the -x option is given, also + sort and merge the output from egs-parallel output into a .egsparallel + log file, and merge the outputs from all the scheduler .eo files into a + .egsparallel-eo file + + Note that the list of basenames can also be given as a glob pattern + within quotes, as in $(basename $0) "myinput*", for example. EOF } @@ -41,7 +45,7 @@ function quit { help) cmd="help";; *) cmd="";; esac - log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 + log "$(basename $0): line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } ### list or delete the file passed as the first argument @@ -65,21 +69,22 @@ function action { msg="$@" fi - # log action - if ! [ -z "$msg" ]; then - log "$msg" - fi - # perform action if -f (--force) option is invoked if [ "$opt_force" = "yes" ]; then eval "$@" fi dirty="yes" + # log action + if ! 
[ -z "$msg" ]; then + log "$msg" + fi + } ### default option values opt_force="" +opt_listonly="no" opt_extra="no" opt_verbose="no" declare -a opt_names @@ -92,6 +97,7 @@ while [ "$#" -gt 0 ]; do -f|--force) if [ -z "$opt_force" ]; then opt_force="yes" fi;; + -l|--list) opt_listonly="yes";; -n|--dry-run) opt_force="no";; -x|--extra) opt_extra="yes";; -v|--verbose) opt_verbose="yes";; @@ -101,6 +107,21 @@ done ### begin script log "$(basename $0)" +log "current directory: $PWD/\n" + + +### just list the egslog basenames and exit if option -l is invoked +if [ "$opt_listonly" = "yes" ]; then + log "LIST of .egslog basenames:" + tab=" " + for f in $(ls -1dF *.egslog); do + if ! [[ "$f" =~ _w[0-9]+.egslog ]]; then + log "${f%.egslog}" + fi + done + printf "\nDONE.\n" + exit 0 +fi ### ensure either -f or -n is specified if [ -z "$opt_force" ]; then @@ -121,11 +142,7 @@ for name in "${opt_names[@]}"; do ### log name and indentation tab="" - dryrun="" - if [ "$opt_force" = "no" ]; then - dryrun="(dry-run)" - fi - log "CLEANING ${name} ... $dryrun" + log "CLEANING ${name} ..." tab=" " ### dirty flag and log tabulation @@ -133,20 +150,21 @@ for name in "${opt_names[@]}"; do ### log dry run if [ "$opt_force" = "no" ]; then - log "with the -f (--force) option, cleaning would:" + tab=" ! " + log "TESTING only: with the -f (--force) option, $(basename $0) WOULD:" fi ### clean .egsparallel and .eo log files if -x (--extra) is invoked, otherwise catenate them if [ "$opt_extra" = "yes" ]; then - clean_extensions="$clean_extensions .egsparallel .egsparallel.eo" + clean_extensions="$clean_extensions .egsparallel .egsparallel-eo" else for f in ${name}.egsparallel; do f=${f%.egsparallel} files=($f.egsjob $f.o $f.eo ${f}_w*.o ${f}_w*.eo) - lsfiles=$(/bin/ls -dx ${files[@]} 2>/dev/null) + lsfiles=$(/bin/ls -d ${files[@]} 2>/dev/null) if ! 
[ -z "$lsfiles" ]; then - action --log "merge $f pbs sterr and stdout files into $f.egsparallel.eo" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep -v 'EGSnrc egs-parallel' >$f.egsparallel.eo" - action --log "merge $f egs-parallel log files into $f.egsparallel" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel.merged" + action --log "merge parallel output and error streams into $f.egsparallel-eo" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep -v 'EGSnrc egs-parallel' >$f.egsparallel-eo" + action --log "merge egs-parallel log messages into $f.egsparallel" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel.merged" action --log --none "mv $f.egsparallel.merged $f.egsparallel" fi done @@ -183,6 +201,8 @@ for name in "${opt_names[@]}"; do if [ "$dirty" = "no" ]; then log "(nothing to clean)" fi + printf "\n" -done \ No newline at end of file +done +printf "DONE.\n" \ No newline at end of file diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index e89c4bd34..0580c7df6 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -51,7 +51,7 @@ function quit { *) cmd="";; esac verbosity="verbose" - log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 + log "$(basename $0): line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } ### begin script diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index 1de8af513..1da7abf43 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -45,7 +45,7 @@ function quit { help) cmd="help";; *) cmd="";; esac - log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 + log "$(basename $0): line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } ### quit function if simulation is done diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index eb2a0224f..5fc4edf6d 100755 --- 
a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -51,7 +51,7 @@ function quit { *) cmd="";; esac verbosity="verbose" - log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 + log "$(basename $0): line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } ### quit function if simulation is done diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index bec77b94f..1086da156 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -51,7 +51,7 @@ function quit { *) cmd="";; esac verbosity="verbose" - log "$0 line $lineno: $msg"; $cmd; log "QUIT."; exit 1 + log "$(basename $0): line $lineno: $msg"; $cmd; log "QUIT."; exit 1 } ### parse command-line arguments (simplistic) From c48b3431d54596501da41112898ea8a11ce67184 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Thu, 27 Aug 2020 16:03:07 -0400 Subject: [PATCH 14/21] Add standard EGSnrc header to egs-parallel scripts --- HEN_HOUSE/scripts/bin/egs-parallel | 32 ++++++++++++++++++++++ HEN_HOUSE/scripts/bin/egs-parallel-clean | 28 +++++++++++++++++++ HEN_HOUSE/scripts/egs-parallel-cpu | 35 +++++++++++++++++++++++- HEN_HOUSE/scripts/egs-parallel-dshtask | 33 ++++++++++++++++++++++ HEN_HOUSE/scripts/egs-parallel-pbs | 35 +++++++++++++++++++++++- HEN_HOUSE/scripts/egs-parallel-pbsdsh | 35 +++++++++++++++++++++++- 6 files changed, 195 insertions(+), 3 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel index a7e98248b..e1e9444ed 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel +++ b/HEN_HOUSE/scripts/bin/egs-parallel @@ -1,4 +1,32 @@ #!/bin/bash +############################################################################### +# +# EGSnrc script to submit parallel jobs +# Copyright (C) 2020 National Research Council Canada +# +# This file is part of EGSnrc. 
+# +# EGSnrc is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# EGSnrc is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for +# more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EGSnrc. If not, see . +# +############################################################################### +# +# Author: Frederic Tessier, 2020 +# +# Contributors: +# +############################################################################### + ### help function function help { @@ -20,6 +48,10 @@ function help { -v | --verbose echo detailed egs-parallel log messages to terminal -c | --command command to run, given in quotes + example: + + $(basename $0) --batch cpu -q short -n12 -v -c 'egs_chamber -i slab -p 521icru' + EOF } diff --git a/HEN_HOUSE/scripts/bin/egs-parallel-clean b/HEN_HOUSE/scripts/bin/egs-parallel-clean index 99789d9d7..b5cd6b25c 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel-clean +++ b/HEN_HOUSE/scripts/bin/egs-parallel-clean @@ -1,4 +1,32 @@ #!/bin/bash +############################################################################### +# +# EGSnrc script to clean up temporary files and logs from parallel jobs +# Copyright (C) 2020 National Research Council Canada +# +# This file is part of EGSnrc. +# +# EGSnrc is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. 
+# +# EGSnrc is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for +# more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EGSnrc. If not, see . +# +############################################################################### +# +# Author: Frederic Tessier, 2020 +# +# Contributors: +# +############################################################################### + ### help function function help { diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index 0580c7df6..b790e85e0 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -1,4 +1,37 @@ #!/bin/bash +############################################################################### +# +# EGSnrc script to submit parallel jobs on a multicore cpu +# Copyright (C) 2020 National Research Council Canada +# +# This file is part of EGSnrc. +# +# EGSnrc is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# EGSnrc is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for +# more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EGSnrc. If not, see . 
+# +############################################################################### +# +# Author: Frederic Tessier, 2020 +# +# Contributors: +# +############################################################################### +# +# This script is not meant to be called directly, but rather via the script +# egs-parallel, with the batch option "--batch cpu" +# +############################################################################### + ### help function function help { @@ -23,7 +56,7 @@ function help { note: This script is not meant to be called directly, but rather via the - egs-parallel script with the batch option "-b cpu" + egs-parallel script with the batch option "--batch cpu" EOF } diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index 1da7abf43..fd72441ed 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -1,4 +1,37 @@ #!/bin/bash +############################################################################### +# +# EGSnrc script to run individual pbsdsh tasks +# Copyright (C) 2020 National Research Council Canada +# +# This file is part of EGSnrc. +# +# EGSnrc is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# EGSnrc is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for +# more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EGSnrc. If not, see . 
+# +############################################################################### +# +# Author: Frederic Tessier, 2020 +# +# Contributors: +# +############################################################################### +# +# This script is not meant to be called directly, but rather from the script +# egs-parallel-pbsdsh (PBS distributed shell) +# +############################################################################### + ### help function function help { diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index 5fc4edf6d..cc9a39ea7 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -1,4 +1,37 @@ #!/bin/bash +############################################################################### +# +# EGSnrc script to submit parallel jobs as individual PBS jobs +# Copyright (C) 2020 National Research Council Canada +# +# This file is part of EGSnrc. +# +# EGSnrc is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# EGSnrc is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for +# more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EGSnrc. If not, see . 
+# +############################################################################### +# +# Author: Frederic Tessier, 2020 +# +# Contributors: +# +############################################################################### +# +# This script is not meant to be called directly, but rather via the script +# egs-parallel, with the batch option "--batch pbs" +# +############################################################################### + ### help function function help { @@ -23,7 +56,7 @@ function help { note: This script is not meant to be called directly, but rather via the - egs-parallel script with the batch option "-b pbs" + egs-parallel script with the batch option "--batch pbs" EOF } diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index 1086da156..6ebe98f81 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -1,4 +1,37 @@ #!/bin/bash +############################################################################### +# +# EGSnrc script to submit parallel jobs under one PBS distributed shell job +# Copyright (C) 2020 National Research Council Canada +# +# This file is part of EGSnrc. +# +# EGSnrc is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# EGSnrc is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for +# more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with EGSnrc. If not, see . 
+# +############################################################################### +# +# Author: Frederic Tessier, 2020 +# +# Contributors: +# +############################################################################### +# +# This script is not meant to be called directly, but rather via the script +# egs-parallel, with the batch option "--batch pbsdsh" +# +############################################################################### + ### help function function help { @@ -23,7 +56,7 @@ function help { note: This script is not meant to be called directly, but rather via the - egs-parallel script with the batch option "-b pbsdsh" + egs-parallel script with the batch option "--batch pbsdsh" EOF } From ad2d8f944e3ed05bf60801980249943c91db487e Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Fri, 28 Aug 2020 10:16:53 -0400 Subject: [PATCH 15/21] Change default batch system to cpu in egs-parallel Change the initial value of the --batch option to "cpu" so that the script invokes the multicore parallel sub-script (egs-parallel-cpu) when no --batch option is specified on the command line. This allows users to try egs-parallel out of the box (most computers are multicore nowadays) without worrying about schedulers. 
--- HEN_HOUSE/scripts/bin/egs-parallel | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel index e1e9444ed..f6dac6737 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel +++ b/HEN_HOUSE/scripts/bin/egs-parallel @@ -40,7 +40,7 @@ function help { options: -h | --help show this help - -b | --batch batch system to use ("pbsdsh" by default) + -b | --batch batch system to use ("cpu" by default) -d | --delay delay in seconds between individual jobs -q | --queue scheduler queue ("long" by default) -n | --nthread number of threads ("8" by default) @@ -85,7 +85,7 @@ function quit { exec 3>egs-parallel-$$.log ### default option values -opt_batch="pbsdsh" +opt_batch="cpu" opt_queue="long" opt_nthread="8" opt_delay="0" From 43137ffb2b86d6b54a880efeb83168e49863e818 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Sat, 29 Aug 2020 10:28:48 -0400 Subject: [PATCH 16/21] Adjust names of cleaned egs-parallel output files --- HEN_HOUSE/scripts/bin/egs-parallel-clean | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel-clean b/HEN_HOUSE/scripts/bin/egs-parallel-clean index b5cd6b25c..a96fb5c5d 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel-clean +++ b/HEN_HOUSE/scripts/bin/egs-parallel-clean @@ -163,7 +163,7 @@ if [ -z "$names" ]; then fi ### define filename extension to clean -clean_extensions=".lock .mederr .egsjob .pbsdsh .eo .e .o" +clean_extensions=".lock .mederr .eo .e .o .pbsdsh .egsjob .egsparallel" ### loop over all names for name in "${opt_names[@]}"; do @@ -184,16 +184,15 @@ for name in "${opt_names[@]}"; do ### clean .egsparallel and .eo log files if -x (--extra) is invoked, otherwise catenate them if [ "$opt_extra" = "yes" ]; then - clean_extensions="$clean_extensions .egsparallel .egsparallel-eo" + clean_extensions="$clean_extensions .egsparallel-log .egsparallel-out" else for f in ${name}.egsparallel; do f=${f%.egsparallel} 
files=($f.egsjob $f.o $f.eo ${f}_w*.o ${f}_w*.eo) lsfiles=$(/bin/ls -d ${files[@]} 2>/dev/null) if ! [ -z "$lsfiles" ]; then - action --log "merge parallel output and error streams into $f.egsparallel-eo" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep -v 'EGSnrc egs-parallel' >$f.egsparallel-eo" - action --log "merge egs-parallel log messages into $f.egsparallel" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel.merged" - action --log --none "mv $f.egsparallel.merged $f.egsparallel" + action --log "create $f.egsparallel-log (merged egs-parallel log messages)" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep 'EGSnrc egs-parallel' | sort >$f.egsparallel-log" + action --log "create $f.egsparallel-out (merged parallel jobs output streams)" "cat $f.egsparallel ${files[@]} 2>/dev/null | grep -v 'EGSnrc egs-parallel' >$f.egsparallel-out" fi done fi From c9ff999270668faad6c47735ef221a2709f77b52 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Mon, 31 Aug 2020 09:21:53 -0400 Subject: [PATCH 17/21] Remove dependencies on lock file in egs-parallel Don't quit the egs-parallel submit scripts if no lock file is found, and add a -f (--force) option to override existing .egsjob or .lock files. The lock file for parallel jobs is managed inside EGSnrc, so the script should not manage it as well: this creates an obscure correlation between the code and the script. Moreover, the uniform run control method does no create a lock file. Previously, the submit script would quit if there was no lock file. The top-level egs-parallel script now prevents the run if there is an .egsjob file OR a .lock file, for the same reason. This can be overridden with the added --force option. 
--- HEN_HOUSE/scripts/bin/egs-parallel | 32 +++++++++++++++------ HEN_HOUSE/scripts/bin/egs-parallel-clean | 2 +- HEN_HOUSE/scripts/egs-parallel-cpu | 10 +++++++ HEN_HOUSE/scripts/egs-parallel-dshtask | 27 ++++-------------- HEN_HOUSE/scripts/egs-parallel-pbs | 36 +++++++++--------------- HEN_HOUSE/scripts/egs-parallel-pbsdsh | 18 ++++++++++-- 6 files changed, 68 insertions(+), 57 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel index f6dac6737..2cf60adc8 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel +++ b/HEN_HOUSE/scripts/bin/egs-parallel @@ -45,6 +45,7 @@ function help { -q | --queue scheduler queue ("long" by default) -n | --nthread number of threads ("8" by default) -o | --option option(s) to pass to job scheduler, in quotes + -f | --force force run, even if lock or egsjob file present -v | --verbose echo detailed egs-parallel log messages to terminal -c | --command command to run, given in quotes @@ -55,7 +56,7 @@ function help { EOF } -### timestamp functionq +### timestamp function function timestamp { printf "EGSnrc egs-parallel $(date -u "+%Y-%m-%d (UTC) %H:%M:%S.%N")" } @@ -91,6 +92,7 @@ opt_nthread="8" opt_delay="0" opt_command="" opt_options="" +opt_force="no" verbosity="silent" declare -a opt_options_array @@ -103,6 +105,7 @@ while [ "$#" -gt 0 ]; do # options without arguments case $opt in -h|--help) help; exit;; + -f|--force) opt_force="yes"; continue;; -v|--verbose) verbosity="verbose"; continue;; esac @@ -182,17 +185,17 @@ while [[ "$#" -gt 0 ]]; do fi done -### check first job index +### check that first job index is an integer if ! 
[[ $cmd_first =~ ^[0-9]+$ ]] ; then quit $LINENO "first job index (-f option) is not an integer: $opt_cmd" fi -### check that an egs input filename is provided and set basename accordingly +### check that an egs input filename is provided if [ -z "$cmd_input" ]; then quit $LINENO "missing input file (-i option) in run command: $opt_cmd" fi -### set simulation basename +### set simulation basename from input file basename="$(basename "$cmd_input" .egsinp)" ### check that the egs input file exists and is readable @@ -201,10 +204,23 @@ if ! [ -r $egsinp ]; then quit $LINENO "cannot access input file: $egsinp.egsinp" fi -### check that there is not currently a .lock file associated with this input file -lock=$egs_home/$cmd_app/$basename.lock -if [ -e $lock ]; then - quit $LINENO "there is already a lock file for $basename: $lock" +### prevent the run if lock file or egsjob file present (unless forced) +if [ "$opt_force" == "no" ]; then + + # check lock file + lock=$egs_home/$cmd_app/$basename.lock + if [ -e $lock ]; then + log "existing lock file: $lock" + quit $LINENO "prevent erasing lock file (override with --force)" + fi + + # check egsjob file + egsjob=$egs_home/$cmd_app/$basename.egsjob + if [ -e $egsjob ]; then + log "existing egsjob file: $egsjob" + quit $LINENO "prevent erasing egsjob file (override with --force)" + fi + fi ### report command-line options diff --git a/HEN_HOUSE/scripts/bin/egs-parallel-clean b/HEN_HOUSE/scripts/bin/egs-parallel-clean index a96fb5c5d..fc01b3cb1 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel-clean +++ b/HEN_HOUSE/scripts/bin/egs-parallel-clean @@ -184,7 +184,7 @@ for name in "${opt_names[@]}"; do ### clean .egsparallel and .eo log files if -x (--extra) is invoked, otherwise catenate them if [ "$opt_extra" = "yes" ]; then - clean_extensions="$clean_extensions .egsparallel-log .egsparallel-out" + clean_extensions="$clean_extensions .egsparallel .egsparallel-log .egsparallel-out" else for f in ${name}.egsparallel; do 
f=${f%.egsparallel} diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index b790e85e0..471ab7ab9 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -117,6 +117,16 @@ if [ $nthread -gt $cpu_nthread ]; then nthread=$cpu_nthread fi +### remove existing egsjob and lock files +if [ -e $basename.egsjob ]; then + log "remove existing egsjob file: $basename.egsjob" + /bin/rm $basename.egsjob +fi +if [ -e $basename.lock ]; then + log "remove existing lock file: $basename.lock" + /bin/rm $basename.lock +fi + ### loop to launch nthread jobs on cpu for job in $(seq 1 $nthread); do diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index fd72441ed..e4d385c1f 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -171,6 +171,11 @@ else delta=10 limit=120 while [ ! -e $basename.egsjob ]; do + + # quit if simulation is already done + quit_if_done + + # otherwise wait for egsjob file log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" sleep $delta total=$((total+$delta)) @@ -183,25 +188,6 @@ else # quit if simulation is already done quit_if_done - # sleep until there is a lock file, maximum 300 seconds - total=0 - delta=10 - limit=300 - while [ ! 
-e $basename.lock ]; do - - # quit if simulation is already done - quit_if_done - - # otherwise wait for lock file - log "$jobstr: wait $delta seconds ($basename.lock not found after $total seconds)" - sleep $delta - total=$((total+$delta)) - if [ $total -gt $limit ]; then - log "$jobstr: QUIT ($basename.lock not found after $limit seconds)" - exit - fi - done - # offset all jobs by a fixed delay (relative to previous job) delta=100000 log "$jobstr: wait $((job*$delta)) microseconds (default job offset delay)" @@ -224,9 +210,6 @@ else if [ -r $basename.lock ]; then content=$(cat $basename.lock) log "$jobstr: found $basename.lock: $content" - else - log "$jobstr: QUIT ($basename.lock does not exist or is not readable)" - exit fi fi diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index cc9a39ea7..2100ce3a1 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -126,6 +126,16 @@ if [ $trim -gt 0 ]; then fi log "job name: $jobname" +### remove existing egsjob and lock files +if [ -e $basename.egsjob ]; then + log "remove existing egsjob file: $basename.egsjob" + /bin/rm $basename.egsjob +fi +if [ -e $basename.lock ]; then + log "remove existing lock file: $basename.lock" + /bin/rm $basename.lock +fi + ### loop to launch nthread pbs jobs for job in $(seq 1 $nthread); do @@ -145,33 +155,16 @@ for job in $(seq 1 $nthread); do delta=10 limit=120 while [ ! -e $basename.egsjob ]; do - log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" - sleep $delta - total=$((total+$delta)) - if [ $total -gt $limit ]; then - log "$jobstr: QUIT (no $basename.egsjob file after $limit seconds)" - exit - fi - done - - # quit if simulation is already done - quit_if_done - - # sleep until there is a lock file, maximum 300 seconds - total=0 - delta=10 - limit=300 - while [ ! 
-e $basename.lock ]; do # quit if simulation is already done quit_if_done - # otherwise wait for lock file - log "$jobstr: wait $delta seconds ($basename.lock not found after $total seconds)" + # otherwise wait for egsjob file + log "$jobstr: wait $delta seconds (no $basename.egsjob file after $total seconds)" sleep $delta total=$((total+$delta)) if [ $total -gt $limit ]; then - log "$jobstr: QUIT ($basename.lock not found after $limit seconds)" + log "$jobstr: QUIT (no $basename.egsjob file after $limit seconds)" exit fi done @@ -201,9 +194,6 @@ for job in $(seq 1 $nthread); do if [ -r $basename.lock ]; then content=$(cat $basename.lock) log "$jobstr: found $basename.lock: $content" - else - log "$jobstr: QUIT ($basename.lock does not exist or is not readable)" - exit fi quit_if_done fi diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index 6ebe98f81..032911430 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -116,16 +116,28 @@ if [ $trim -gt 0 ]; then fi log "job name: $jobname" -### create pbsdsh directory to store task files for job numbers +### create pbsdsh directory to store task files for job numbers (remove existing directory) pbsdsh_dir=$basename.pbsdsh -log "create temporary directory $pbsdsh_dir" if [ -e $pbsdsh_dir ]; then + log "remove existing directory $pbsdsh_dir" /bin/rm -r $pbsdsh_dir fi +log "create temporary directory $pbsdsh_dir" err=$(mkdir $pbsdsh_dir 2>&1) if ! 
[ -z $err ]; then quit $LINENO "$err" fi + +### remove existing egsjob and lock files +if [ -e $basename.egsjob ]; then + log "remove existing egsjob file: $basename.egsjob" + /bin/rm $basename.egsjob +fi +if [ -e $basename.lock ]; then + log "remove existing lock file: $basename.lock" + /bin/rm $basename.lock +fi + ### launch pbsdsh tasks task_script=$HEN_HOUSE/scripts/egs-parallel-dshtask jobpid=$(qsub -q $queue $scheduler_options < Date: Tue, 1 Sep 2020 09:28:43 -0400 Subject: [PATCH 18/21] Detect failure to launch pbs job in egs-parallel Detect pbs jobs that fail to launch in egs-parallel, by looking at the echoed job pid: quit immediately if it is not an integer. If the first job fails, subsequent jobs are not launched. Report the failure in the log. Also adjust the format of a few log messages. --- HEN_HOUSE/scripts/bin/egs-parallel | 4 ++-- HEN_HOUSE/scripts/egs-parallel-dshtask | 3 ++- HEN_HOUSE/scripts/egs-parallel-pbs | 12 ++++++++++-- HEN_HOUSE/scripts/egs-parallel-pbsdsh | 9 +++++++-- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/HEN_HOUSE/scripts/bin/egs-parallel b/HEN_HOUSE/scripts/bin/egs-parallel index 2cf60adc8..73d04ac51 100755 --- a/HEN_HOUSE/scripts/bin/egs-parallel +++ b/HEN_HOUSE/scripts/bin/egs-parallel @@ -134,7 +134,7 @@ done opt_options="${opt_options_array[@]}" ### begin script -log "BEGIN $0" +log "BEGIN $(basename $0)" ### EGSnrc environment variables log "EGSnrc environment:" @@ -238,7 +238,7 @@ log " options = $opt_options" logfile=$egs_home/$cmd_app/$basename.egsparallel /bin/mv egs-parallel-$$.log $logfile exec 3>>$logfile -log "egs-parallel log: $logfile" +log "log file: $logfile" ### go to egs application directory log "cd $egs_home/$cmd_app" diff --git a/HEN_HOUSE/scripts/egs-parallel-dshtask b/HEN_HOUSE/scripts/egs-parallel-dshtask index e4d385c1f..16c62187a 100755 --- a/HEN_HOUSE/scripts/egs-parallel-dshtask +++ b/HEN_HOUSE/scripts/egs-parallel-dshtask @@ -151,7 +151,8 @@ jobstr=$(printf "job %04d" $job) log 
"$jobstr <- $taskstr" # log the host and pid of this job -log "$jobstr: host=$(hostname) BEGIN pid=$$" +log "$jobstr: host=$(hostname)" +log "$jobstr: BEGIN pid=$$" ### manage jobs to avoid bottleneck and race conditions if [ $job -eq 1 ]; then diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index 2100ce3a1..6fc11b5d7 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -199,9 +199,11 @@ for job in $(seq 1 $nthread); do fi ### launch the job + pbscommand="qsub -q $queue $scheduler_options" runcommand="$command -b -P $nthread -j $job -f $first" - log "$jobstr: SUBMIT $runcommand" - jobpid=$(qsub -q $queue $scheduler_options < Date: Fri, 26 Feb 2021 11:52:26 -0500 Subject: [PATCH 19/21] Fix pbsdsh jobnames starting with a period Fix a crash that occurred when the 14 character truncation of the filename for an egs-parallel pbsdsh job ended up starting with a '.'. The first character is now trimmed away if that is the case, so the job name is only 13 characters. --- HEN_HOUSE/scripts/egs-parallel-pbs | 4 ++++ HEN_HOUSE/scripts/egs-parallel-pbsdsh | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index 6fc11b5d7..ade847f9c 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -124,6 +124,10 @@ trim=$(( $(echo $jobname | wc -c) - 14 )) if [ $trim -gt 0 ]; then jobname=$(echo $jobname | cut -c $trim-) fi +### job names can't start with a '.', hopefully the next character is OK +if [[ ${jobname::1} == "." 
]]; then + jobname="${jobname:1}" +fi log "job name: $jobname" ### remove existing egsjob and lock files diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index ff6873499..d0c0aa7a2 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -114,6 +114,10 @@ trim=$(( $(echo $jobname | wc -c) - 14 )) if [ $trim -gt 0 ]; then jobname=$(echo $jobname | cut -c $trim-) fi +### job names can't start with a '.', hopefully the next character is OK +if [[ ${jobname::1} == "." ]]; then + jobname="${jobname:1}" +fi log "job name: $jobname" ### create pbsdsh directory to store task files for job numbers (remove existing directory) From 8a7311f56b94ffa87e32474eb2c7944e687d1986 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Fri, 26 Mar 2021 08:01:21 -0400 Subject: [PATCH 20/21] Strip non-alphanumeric lead chars in PBS job name Ensure that the PBS job name starts with an alphanumeric character [0-9A-Za-z], following the PBS scheduler requirement. To avoid failed jobs solely on account of a bad job name, strip all leading non-alphanumeric characters from the job name. Note that the egsinp basename is not affected; this is strictly for the job name passed to qsub via the -N option.
--- HEN_HOUSE/scripts/egs-parallel-pbs | 8 ++------ HEN_HOUSE/scripts/egs-parallel-pbsdsh | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/HEN_HOUSE/scripts/egs-parallel-pbs b/HEN_HOUSE/scripts/egs-parallel-pbs index ade847f9c..aff1d4dfd 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbs +++ b/HEN_HOUSE/scripts/egs-parallel-pbs @@ -118,16 +118,12 @@ verbosity=$8 ### link file descriptor 3 to egs-parallel log file exec 3>>$basename.egsparallel -### set scheduler job name (maximum 14 characters) -jobname=$(echo ${basename}[$nthread]) +### set scheduler job name (skip leading non alnum chars, maximum 14 characters) +jobname=$(echo "${basename}[$nthread]" | sed 's/^[^[:alnum:]]*//') trim=$(( $(echo $jobname | wc -c) - 14 )) if [ $trim -gt 0 ]; then jobname=$(echo $jobname | cut -c $trim-) fi -### job names can't start with a '.', hopefully the next character is OK -if [[ ${jobname::1} == "." ]]; then - jobname="${jobname:1}" -fi log "job name: $jobname" ### remove existing egsjob and lock files diff --git a/HEN_HOUSE/scripts/egs-parallel-pbsdsh b/HEN_HOUSE/scripts/egs-parallel-pbsdsh index d0c0aa7a2..cd4764a50 100755 --- a/HEN_HOUSE/scripts/egs-parallel-pbsdsh +++ b/HEN_HOUSE/scripts/egs-parallel-pbsdsh @@ -108,16 +108,12 @@ exec 3>>$basename.egsparallel ### begin script log "BEGIN $0" -### set scheduler job name (maximum 14 characters) -jobname=$(echo ${basename}[$nthread]) +### set scheduler job name (skip leading non alnum chars, maximum 14 characters) +jobname=$(echo "${basename}[$nthread]" | sed 's/^[^[:alnum:]]*//') trim=$(( $(echo $jobname | wc -c) - 14 )) if [ $trim -gt 0 ]; then jobname=$(echo $jobname | cut -c $trim-) fi -### job names can't start with a '.', hopefully the next character is OK -if [[ ${jobname::1} == "." 
]]; then - jobname="${jobname:1}" -fi log "job name: $jobname" ### create pbsdsh directory to store task files for job numbers (remove existing directory) From 01b86ac04bc0490b4fff99f47c5582fff705a4b0 Mon Sep 17 00:00:00 2001 From: Frederic Tessier Date: Wed, 31 Mar 2021 07:13:18 -0400 Subject: [PATCH 21/21] Add command to get thread count on Darwin system --- HEN_HOUSE/scripts/egs-parallel-cpu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/HEN_HOUSE/scripts/egs-parallel-cpu b/HEN_HOUSE/scripts/egs-parallel-cpu index 471ab7ab9..a37d7cee7 100755 --- a/HEN_HOUSE/scripts/egs-parallel-cpu +++ b/HEN_HOUSE/scripts/egs-parallel-cpu @@ -111,7 +111,11 @@ exec 3>>$basename.egsparallel log "BEGIN $0" ### restrict number of jobs to the number of cpu threads -cpu_nthread=$(grep -c processor /proc/cpuinfo) +os_name="$(uname -s)" +case "$os_name" in + Darwin) cpu_nthread=$(sysctl -n hw.ncpu);; + *) cpu_nthread=$(grep -c processor /proc/cpuinfo);; +esac if [ $nthread -gt $cpu_nthread ]; then log "reduce requested threads ($nthread) to match available cpu threads ($cpu_nthread)" nthread=$cpu_nthread