Skip to content

Commit

Permalink
Merge branch 'ACME-Climate/jgfouca/cime/perf_data_fixes' (PR #1201)
Browse files Browse the repository at this point in the history
Provide several fixes for the performance archiving to work with CIME5
 
Timing directory in RUNDIR needs to have lid appended
Create timing.$LID.saved
Do not include full directory hierarchy in tar files
Copy all timing files to performance archive
Restore support for mach_syslog
In the case's timing subdirectory, stop compressing the performance summary file and
start compressing the raw global statistics timing data instead.
For create_test, if save_timing is false, be sure to set SAVE_TIMING to False
    
Fixes #1177
    
[BFB]
  • Loading branch information
rljacob authored Jan 26, 2017
2 parents 3dba187 + 2a2fc7f commit c46e8f0
Show file tree
Hide file tree
Showing 13 changed files with 202 additions and 68 deletions.
2 changes: 1 addition & 1 deletion cime_config/acme/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@
<directives>
<directive> --job-name={{ job_id }}</directive>
<directive> --nodes={{ num_nodes }}</directive>
<directive> --output={{ output_error_path }} </directive>
<directive> --output={{ output_error_path }}.%j </directive>
<directive> --exclusive </directive>
<directive> --time={{ job_wallclock_time }}</directive>
<directive> --partition={{ job_queue }}</directive>
Expand Down
10 changes: 5 additions & 5 deletions cime_config/acme/machines/syslog.cetus
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ set timing = $5
set dir = $6

# wait until output file is nonempty before checking remaining time
# (note that calling script 'touch'es the cesm log file before spawning this script, so that 'wc' does not fail)
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set outlth = 0
while ($outlth < 1)
sleep 10
set outlth = `wc \-l $run/cesm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeRemaining = `qstat -lf $jid | grep TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
Expand All @@ -29,16 +29,16 @@ if ("X$rem_secs" == "X") set rem_secs = 0
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp -p $run/cesm.log.$lid $dir/cesm.log.$lid.$remaining
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/cesm.log.$lid | tail > $dir/cesm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
cp -p -u $timing/* $dir
/bin/cp --preserve=timestamps -u $timing/* $dir
chmod a+r $dir/*
sleep $sample_interval
set TimeRemaining = `qstat -lf $jid | grep TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
Expand Down
72 changes: 72 additions & 0 deletions cime_config/acme/machines/syslog.cori-haswell
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/csh -f
# cori-haswell syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <time stamp> <run directory> <timing directory> <output directory>
#
# Overview: while the job still has walltime left, this script periodically
# saves the most recent progress lines from the component logs, copies the
# timing files and an sqs listing into the output directory, and rewrites
# the Walltime.Remaining file in the run directory.

# NOTE(review): sec is set here but never read anywhere in this script.
set sec = 0
# Positional arguments, in the order given by the usage line above.
set sample_interval = $1
set jid = $2
set lid = $3
set run = $4
set timing = $5
set dir = $6

# wait until job mapping information is output before saving output file
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
# The node count is the run of digits following NumNodes= in the sqs -f output.
# The loop below polls every 10 seconds until the acme log has at least that
# many lines (presumably one mapping line per node - TODO confirm).
set nnodes = `sqs -f $jid | grep NumNodes | sed 's/^ *NumNodes= *\([0-9]*\).*/\1/' `
set outlth = 0
while ($outlth < $nnodes)
sleep 10
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

# Extract the job time limit as H:M:S from the line holding both RunTime= and
# TimeLimit=, then split it into fields with leading zeros removed.
# NOTE(review): a limit printed with a leading day count would not match this
# pattern - confirm the format that sqs emits for multi-day limits.
set TimeLimit = `sqs -f $jid | grep TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set limit_hours = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set limit_mins = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set limit_secs = `echo $TimeLimit | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
# A field made up only of zeros comes back empty after the stripping above,
# so empty fields are reset to 0 before the arithmetic.
# (Leading zeros are presumably stripped so that the at-sign arithmetic does
# not read the fields as octal - TODO confirm.)
if ("X$limit_hours" == "X") set limit_hours = 0
if ("X$limit_mins" == "X") set limit_mins = 0
if ("X$limit_secs" == "X") set limit_secs = 0
# Time limit converted to seconds.
@ limit = 3600 * $limit_hours + 60 * $limit_mins + $limit_secs

# Same extraction and conversion for the elapsed run time reported by sqs.
set RunTime = `sqs -f $jid | grep RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
if ("X$runt_hours" == "X") set runt_hours = 0
if ("X$runt_mins" == "X") set runt_mins = 0
if ("X$runt_secs" == "X") set runt_secs = 0
@ runt = 3600 * $runt_hours + 60 * $runt_mins + $runt_secs

# Remaining walltime in seconds; written together with the sampling interval
# to Walltime.Remaining in the run directory.
@ remaining = $limit - $runt
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
# Keep a copy of the acme log as it stands now, tagged with the remaining
# time and keeping the original timestamps.
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

# Sampling loop: runs until the computed remaining time is no longer positive.
while ($remaining > 0)
# For each component log, save the tail of the case-insensitive matches for
# its progress keywords into a file tagged with the current remaining time.
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
# Copy timing files that are new or newer than the copy already in the output
# directory (the -u flag), keeping their timestamps.
/bin/cp --preserve=timestamps -u $timing/* $dir
# xtnodestat > $dir/xtnodestat.$lid.$remaining
# Save the output of sqs -w -a for this sample (presumably a snapshot of the
# batch queue - TODO confirm).
sqs -w -a > $dir/sqsw.$lid.$remaining
# Make everything collected so far readable by all users.
chmod a+r $dir/*
sleep $sample_interval
# Recompute the elapsed and remaining time exactly as before the loop, then
# rewrite Walltime.Remaining with the new value.
set RunTime = `sqs -f $jid | grep RunTime | sed 's/^ *RunTime=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
set runt_hours = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\1/' `
set runt_mins = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\2/' `
set runt_secs = `echo $RunTime | sed 's/^0*\([0-9]*\):0*\([0-9]*\):0*\([0-9]*\)/\3/' `
if ("X$runt_hours" == "X") set runt_hours = 0
if ("X$runt_mins" == "X") set runt_mins = 0
if ("X$runt_secs" == "X") set runt_secs = 0
@ runt = 3600 * $runt_hours + 60 * $runt_mins + $runt_secs
@ remaining = $limit - $runt
cat > $run/Walltime.Remaining << EOF2
$remaining $sample_interval
EOF2

end

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/csh -f
# corip1 syslog script:
# cori-knl syslog script:
# mach_syslog <sampling interval (in seconds)> <job identifier> <time stamp> <run directory> <timing directory> <output directory>

set sec = 0
Expand All @@ -11,12 +11,12 @@ set timing = $5
set dir = $6

# wait until job mapping information is output before saving output file
# (note that calling script 'touch'es the cesm log file before spawning this script, so that 'wc' does not fail)
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set nnodes = `sqs -f $jid | grep NumNodes | sed 's/^ *NumNodes= *\([0-9]*\).*/\1/' `
set outlth = 0
while ($outlth < $nnodes)
sleep 10
set outlth = `wc \-l $run/cesm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeLimit = `sqs -f $jid | grep TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
Expand All @@ -41,16 +41,16 @@ if ("X$runt_secs" == "X") set runt_secs = 0
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp -p $run/cesm.log.$lid $dir/cesm.log.$lid.$remaining
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/cesm.log.$lid | tail > $dir/cesm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
cp -p -u $timing/* $dir
/bin/cp --preserve=timestamps -u $timing/* $dir
# xtnodestat > $dir/xtnodestat.$lid.$remaining
sqs -w -a > $dir/sqsw.$lid.$remaining
chmod a+r $dir/*
Expand Down
10 changes: 5 additions & 5 deletions cime_config/acme/machines/syslog.edison
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ set timing = $5
set dir = $6

# wait until job mapping information is output before saving output file
# (note that calling script 'touch'es the cesm log file before spawning this script, so that 'wc' does not fail)
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set nnodes = `sqs -f $jid | grep NumNodes | sed 's/^ *NumNodes= *\([0-9]*\).*/\1/' `
set outlth = 0
while ($outlth < $nnodes)
sleep 10
set outlth = `wc \-l $run/cesm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeLimit = `sqs -f $jid | grep TimeLimit | sed 's/^ *RunTime=.*TimeLimit=\([0-9]*:[0-9]*:[0-9]*\) .*/\1/' `
Expand All @@ -41,16 +41,16 @@ if ("X$runt_secs" == "X") set runt_secs = 0
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp -p $run/cesm.log.$lid $dir/cesm.log.$lid.$remaining
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/cesm.log.$lid | tail > $dir/cesm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
cp -p -u $timing/* $dir
/bin/cp --preserve=timestamps -p -u $timing/* $dir
# xtnodestat > $dir/xtnodestat.$lid.$remaining
sqs -w -a > $dir/sqsw.$lid.$remaining
chmod a+r $dir/*
Expand Down
10 changes: 5 additions & 5 deletions cime_config/acme/machines/syslog.mira
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ set timing = $5
set dir = $6

# wait until output file is nonempty before checking remaining time
# (note that calling script 'touch'es the cesm log file before spawning this script, so that 'wc' does not fail)
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set outlth = 0
while ($outlth < 1)
sleep 10
set outlth = `wc \-l $run/cesm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end

set TimeRemaining = `qstat -lf $jid | grep TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
Expand All @@ -29,16 +29,16 @@ if ("X$rem_secs" == "X") set rem_secs = 0
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp -p $run/cesm.log.$lid $dir/cesm.log.$lid.$remaining
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/cesm.log.$lid | tail > $dir/cesm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
cp -p -u $timing/* $dir
/bin/cp --preserve=timestamps -u $timing/* $dir
chmod a+r $dir/*
sleep $sample_interval
set TimeRemaining = `qstat -lf $jid | grep TimeRemaining | sed 's/^ *TimeRemaining *: *\([0-9]*:[0-9]*:[0-9]*\) */\1/' `
Expand Down
10 changes: 5 additions & 5 deletions cime_config/acme/machines/syslog.titan
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,27 @@ set timing = $5
set dir = $6

# wait until job mapping information is output before saving output file
# (note that calling script 'touch'es the cesm log file before spawning this script, so that 'wc' does not fail)
# (note that calling script 'touch'es the acme log file before spawning this script, so that 'wc' does not fail)
set nnodes = `qstat -f $jid | grep Resource_List.nodes | sed 's/ *Resource_List.nodes = *\([0-9]*\):ppn=*\([0-9]*\) */\1/' `
set outlth = 0
while ($outlth < $nnodes)
sleep 10
set outlth = `wc \-l $run/cesm.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
set outlth = `wc \-l $run/acme.log.$lid | sed 's/ *\([0-9]*\) *.*/\1/' `
end
set remaining = `qstat -f $jid | grep Walltime.Remaining | sed 's/ *Walltime.Remaining = *\([0-9]*\) */\1/' `
cat > $run/Walltime.Remaining <<EOF1
$remaining $sample_interval
EOF1
/bin/cp -p $run/cesm.log.$lid $dir/cesm.log.$lid.$remaining
/bin/cp --preserve=timestamps $run/acme.log.$lid $dir/acme.log.$lid.$remaining

while ($remaining > 0)
grep -a -i -e "nstep" -e "model date" $run/*atm.log.$lid | tail > $dir/atm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/cesm.log.$lid | tail > $dir/cesm.log.$lid.nstep.$remaining
# grep -a -i "nstep" $run/acme.log.$lid | tail > $dir/acme.log.$lid.nstep.$remaining
grep -a -i -e "timestep" -e "model date" $run/*lnd.log.$lid | tail > $dir/lnd.log.$lid.timestep.$remaining
grep -a -i -e "timestep" -e "Step number" -e "model date" $run/*ocn.log.$lid | tail > $dir/ocn.log.$lid.stepnum.$remaining
grep -a -i -e "timestep" -e "istep" -e "model date" $run/*ice.log.$lid | tail > $dir/ice.log.$lid.istep.$remaining
grep -a -i "model date" $run/*cpl.log.$lid | tail > $dir/cpl.log.$lid.modeldata.$remaining
cp -p -u $timing/* $dir
/bin/cp --preserve=timestamps -u $timing/* $dir
xtnodestat > $dir/xtnodestat.$lid.$remaining
showq > $dir/showq.$lid.$remaining
chmod a+r $dir/*
Expand Down
1 change: 1 addition & 0 deletions driver_cpl/cime_config/buildnml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def _main_func():
% (grid, atm_grid, lnd_grid, rof_grid, ocn_grid, wav_grid)

rc, out, err = run_cmd(cmd, from_dir=confdir)
logger.info(out)
expect(rc==0,"Command %s failed rc=%d\nout=%s\nerr=%s"%(cmd,rc,out,err))

# copy drv_in, drv_flds_in, seq_maps.rc and all *modio* files to rundir
Expand Down
9 changes: 8 additions & 1 deletion utils/python/CIME/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,7 @@ def _set_pio_xml(self):
def _create_caseroot_tools(self):
machines_dir = os.path.abspath(self.get_value("MACHDIR"))
toolsdir = os.path.join(self.get_value("CIMEROOT"),"scripts","Tools")
casetools = os.path.join(self._caseroot, "Tools")
# setup executable files in caseroot/
exefiles = (os.path.join(toolsdir, "case.setup"),
os.path.join(toolsdir, "case.build"),
Expand Down Expand Up @@ -721,7 +722,7 @@ def _create_caseroot_tools(self):
toolfiles.append( os.path.join(toolsdir,"mdiag_reduce.pl") )

for toolfile in toolfiles:
destfile = os.path.join(self._caseroot,"Tools",os.path.basename(toolfile))
destfile = os.path.join(casetools, os.path.basename(toolfile))
expect(os.path.isfile(toolfile)," File %s does not exist"%toolfile)
try:
os.symlink(toolfile, destfile)
Expand Down Expand Up @@ -764,6 +765,12 @@ def _create_caseroot_tools(self):
# except Exception as e:
# logger.warning("FAILED to set up infofiles: %s" % str(e))

if get_model() == "acme":
if os.path.exists(os.path.join(machines_dir, "syslog.%s" % machine)):
shutil.copy(os.path.join(machines_dir, "syslog.%s" % machine), os.path.join(casetools, "mach_syslog"))
else:
shutil.copy(os.path.join(machines_dir, "syslog.noop"), os.path.join(casetools, "mach_syslog"))

def _create_caseroot_sourcemods(self):
components = self.get_compset_components()
for component in components:
Expand Down
6 changes: 3 additions & 3 deletions utils/python/CIME/get_timing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from CIME.XML.standard_module_setup import *

import datetime, shutil, re, gzip
import datetime, shutil, re

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -160,7 +160,7 @@ def getTiming(self):
finfilename = os.path.join(self.caseroot, "timing",
"%s_timing_stats.%s" % (cime_model, self.lid))
foutfilename = os.path.join(self.caseroot, "timing",
"%s_timing.%s.%s.gz" % (cime_model, caseid, self.lid))
"%s_timing.%s.%s" % (cime_model, caseid, self.lid))

timingDir = os.path.join(self.caseroot, "timing")
if not os.path.isdir(timingDir):
Expand Down Expand Up @@ -215,7 +215,7 @@ def getTiming(self):
m.offset = int((maxoffset*m.rootpe)/peminmax) + extraoff
cpl.offset = 0
try:
self.fout = gzip.open(foutfilename, "wb")
self.fout = open(foutfilename, "w")
except Exception, e:
logger.critical("Could not open file for writing: %s"
% foutfilename)
Expand Down
Loading

0 comments on commit c46e8f0

Please sign in to comment.