Skip to content

Commit

Permalink
Merge branch 'jgfouca/cime/refactor_provenance' (PR #1436)
Browse files Browse the repository at this point in the history
Refactor provenance saving

The logic associated with the env_run.xml variables SAVE_TIMING and
SAVE_TIMING_DIR, used to decide when and where to archive performance
data and related provenance, was inadvertently modified in the
transition between CIME2 and CIME5. Also, the CIME5.3 merge
inadvertently mixed up provenance data archiving with other
performance data collection-related actions.

The restored logic is as follows:

a) If SAVE_TIMING is TRUE, then, at the completion of a job, the
timing subdirectory in the job run directory is renamed timing.$lid,
and then tarred and gzipped (and the original timing.$lid directory is
removed). An empty file named timing.$lid.saved is created in the
timing subdirectory of the case directory to flag that this occurred.

Also, if SAVE_TIMING_DIR points to an existing directory in which a
directory named performance_archive already exists or can be created,
then the performance data and some system, job, and case
provenance data are copied to the performance_archive.

If SAVE_TIMING_DIR does not point to such a directory or contains the
string 'UNSET', then this performance data and provenance archiving
does not occur.

A system-specific default for SAVE_TIMING_DIR is typically set in
cime/config/acme/machines/config_machines.xml. If not set there, then
the default for SAVE_TIMING_DIR is 'UNSET'.

b) If SAVE_TIMING is FALSE, then the timing directory in the job run
directory is not renamed. Also, SAVE_TIMING_DIR is ignored, and
performance data and provenance are not archived.

Fixes #1421

[BFB]
  • Loading branch information
Patrick Worley committed May 2, 2017
2 parents b5c009e + 5836736 commit ca66db1
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 52 deletions.
2 changes: 1 addition & 1 deletion scripts/lib/CIME/code_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ def check_code(files, num_procs=10, interactive=False):
"""
# Get list of files to check, we look to see if user-provided file argument
# is a valid file, if not, we search the repo for a file with similar name.
repo_files = run_cmd_no_fail('git ls-files --full-name %s' % get_cime_root(), verbose=False).splitlines()
files_to_check = []
if files:
repo_files = run_cmd_no_fail('git ls-files', from_dir=get_cime_root(), verbose=False).splitlines()
for filearg in files:
if os.path.exists(filearg):
files_to_check.append(os.path.abspath(filearg))
Expand Down
89 changes: 47 additions & 42 deletions scripts/lib/CIME/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,11 @@ def _get_batch_job_id_for_syslog(case):
else:
return None

def save_build_provenance_acme(case, lid=None):
def _save_build_provenance_acme(case, lid):
cimeroot = case.get_value("CIMEROOT")
exeroot = case.get_value("EXEROOT")
caseroot = case.get_value("CASEROOT")

lid = os.environ["LID"] if lid is None else lid
# Save git describe
describe_prov = os.path.join(exeroot, "GIT_DESCRIBE.%s" % lid)
if os.path.exists(describe_prov):
Expand Down Expand Up @@ -73,8 +72,7 @@ def save_build_provenance_acme(case, lid=None):
os.remove(generic_name)
os.symlink(the_match, generic_name)


def save_build_provenance_cesm(case, lid=None): # pylint: disable=unused-argument
def _save_build_provenance_cesm(case, lid): # pylint: disable=unused-argument
version = case.get_value("MODEL_VERSION")
# version has already been recorded
caseroot = case.get_value("CASEROOT")
Expand All @@ -84,20 +82,17 @@ def save_build_provenance_cesm(case, lid=None): # pylint: disable=unused-argumen
def save_build_provenance(case, lid=None):
with SharedArea():
model = case.get_value("MODEL")
lid = os.environ["LID"] if lid is None else lid

if model == "acme":
save_build_provenance_acme(case, lid=lid)
_save_build_provenance_acme(case, lid)
elif model == "cesm":
save_build_provenance_cesm(case, lid=lid)

def save_prerun_provenance_acme(case, lid=None):
if not case.get_value("SAVE_TIMING"):
return

lid = os.environ["LID"] if lid is None else lid
_save_build_provenance_cesm(case, lid)

def _save_prerun_timing_acme(case, lid):
timing_dir = case.get_value("SAVE_TIMING_DIR")
if timing_dir is None or timing_dir == 'UNSET':
logger.warning("ACME requires SAVE_TIMING_DIR to be set in order to save timings. Skipping save timings")
if timing_dir is None or not os.path.isdir(timing_dir):
logger.warning("SAVE_TIMING_DIR '%s' is not valid. ACME requires a valid SAVE_TIMING_DIR to be set in order to archive timings. Skipping archive timings" % timing_dir)
return

logger.info("timing dir is %s" % timing_dir)
Expand Down Expand Up @@ -204,7 +199,11 @@ def save_prerun_provenance_acme(case, lid=None):
else:
run_cmd_no_fail("git describe", arg_stdout=os.path.join(full_timing_dir, "GIT_DESCRIBE.%s" % lid), from_dir=os.path.dirname(cimeroot))

def save_prerun_provenance_cesm(case, lid=None): # pylint: disable=unused-argument
def _save_prerun_provenance_acme(case, lid):
if case.get_value("SAVE_TIMING"):
_save_prerun_timing_acme(case, lid)

def _save_prerun_provenance_cesm(case, lid): # pylint: disable=unused-argument
pass

def save_prerun_provenance(case, lid=None):
Expand All @@ -219,30 +218,42 @@ def save_prerun_provenance(case, lid=None):

model = case.get_value("MODEL")
if model == "acme":
save_prerun_provenance_acme(case, lid=lid)
_save_prerun_provenance_acme(case, lid)
elif model == "cesm":
save_prerun_provenance_cesm(case, lid=lid)
_save_prerun_provenance_cesm(case, lid)

def save_postrun_provenance_cesm(case, lid=None):
def _save_postrun_provenance_cesm(case, lid):
save_timing = case.get_value("SAVE_TIMING")
if save_timing:
lid = os.environ["LID"] if lid is None else lid
rundir = case.get_value("RUNDIR")
timing_dir = case.get_value("SAVE_TIMING_DIR")
timing_dir = os.path.join(timing_dir, case.get_value("CASE"))
shutil.move(os.path.join(rundir,"timing"),
os.path.join(timing_dir,"timing."+lid))

def save_postrun_provenance_acme(case, lid):
save_timing = case.get_value("SAVE_TIMING")
if not save_timing:
return

lid = os.environ["LID"] if lid is None else lid

def _save_postrun_timing_acme(case, lid):
caseroot = case.get_value("CASEROOT")
rundir = case.get_value("RUNDIR")
timing_dir = case.get_value("SAVE_TIMING_DIR")
caseroot = case.get_value("CASEROOT")

# tar timings
rundir_timing_dir = os.path.join(rundir, "timing." + lid)
shutil.move(os.path.join(rundir, "timing"), rundir_timing_dir)
with tarfile.open("%s.tar.gz" % rundir_timing_dir, "w:gz") as tfd:
tfd.add(rundir_timing_dir, arcname=os.path.basename(rundir_timing_dir))

shutil.rmtree(rundir_timing_dir)

gzip_existing_file(os.path.join(caseroot, "timing", "acme_timing_stats.%s" % lid))

# JGF: not sure why we do this
timing_saved_file = "timing.%s.saved" % lid
touch(os.path.join(caseroot, "timing", timing_saved_file))

if timing_dir is None or not os.path.isdir(timing_dir):
logger.warning("SAVE_TIMING_DIR '%s' is not valid. ACME requires a valid SAVE_TIMING_DIR to be set in order to archive timings. Skipping archive timings" % timing_dir)
return

mach = case.get_value("MACH")
base_case = case.get_value("CASE")
full_timing_dir = os.path.join(timing_dir, "performance_archive", getpass.getuser(), base_case, lid)
Expand All @@ -261,21 +272,9 @@ def save_postrun_provenance_acme(case, lid):
finally:
os.remove(syslog_jobid_path)

# copy/tar timings
rundir_timing_dir = os.path.join(rundir, "timing." + lid)
shutil.move(os.path.join(rundir, "timing"), rundir_timing_dir)
with tarfile.open("%s.tar.gz" % rundir_timing_dir, "w:gz") as tfd:
tfd.add(rundir_timing_dir, arcname=os.path.basename(rundir_timing_dir))

shutil.rmtree(rundir_timing_dir)
# copy timings
copy_umask("%s.tar.gz" % rundir_timing_dir, full_timing_dir)

gzip_existing_file(os.path.join(caseroot, "timing", "acme_timing_stats.%s" % lid))

# JGF: not sure why we do this
timing_saved_file = "timing.%s.saved" % lid
touch(os.path.join(caseroot, "timing", timing_saved_file))

#
# save output files and logs
#
Expand Down Expand Up @@ -311,10 +310,16 @@ def save_postrun_provenance_acme(case, lid):
if not filename.endswith(".gz"):
gzip_existing_file(os.path.join(root, filename))

def _save_postrun_provenance_acme(case, lid):
if case.get_value("SAVE_TIMING"):
_save_postrun_timing_acme(case, lid)

def save_postrun_provenance(case, lid=None):
with SharedArea():
model = case.get_value("MODEL")
lid = os.environ["LID"] if lid is None else lid

if model == "acme":
save_postrun_provenance_acme(case, lid=lid)
_save_postrun_provenance_acme(case, lid)
elif model == "cesm":
save_postrun_provenance_cesm(case, lid=lid)
_save_postrun_provenance_cesm(case, lid)
9 changes: 0 additions & 9 deletions src/drivers/mct/cime_config/config_component.xml
Original file line number Diff line number Diff line change
Expand Up @@ -915,15 +915,6 @@
<desc>timer output depth</desc>
</entry>

<entry id="SAVE_TIMING_DIR">
<type>char</type>
<valid_values></valid_values>
<default_value>timing</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Where to auto archive timing data</desc>
</entry>

<entry id="PROFILE_PAPI_ENABLE">
<type>logical</type>
<valid_values>TRUE,FALSE</valid_values>
Expand Down
9 changes: 9 additions & 0 deletions src/drivers/mct/cime_config/config_component_acme.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@
<desc>logical to save timing files in rundir</desc>
</entry>

<entry id="SAVE_TIMING_DIR">
<type>char</type>
<valid_values></valid_values>
<default_value>UNSET</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Where to auto archive timing data</desc>
</entry>

<entry id="TIMER_DETAIL">
<type>integer</type>
<default_value>20</default_value>
Expand Down
9 changes: 9 additions & 0 deletions src/drivers/mct/cime_config/config_component_cesm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@
<desc>logical to save timing files in rundir</desc>
</entry>

<entry id="SAVE_TIMING_DIR">
<type>char</type>
<valid_values></valid_values>
<default_value>timing</default_value>
<group>run_flags</group>
<file>env_run.xml</file>
<desc>Where to auto archive timing data</desc>
</entry>

<entry id="TPROF_TOTAL">
<type>integer</type>
<default_value>0</default_value>
Expand Down

0 comments on commit ca66db1

Please sign in to comment.