This repository has been archived by the owner on Dec 7, 2018. It is now read-only.

Rollup: New sampler API -> support for emcee #68

Merged: 47 commits, Aug 22, 2018
Commits (47)
6eaa748
start changing the base sampler api
Jul 18, 2018
d41964d
start InferenceFile -> BaseInferenceFile
Jul 12, 2018
cef9e8c
rename hdf.py base_hdf.py
Jul 12, 2018
6972102
add parse_parameters function
Jul 12, 2018
7c7e615
add module for base mcmc io
Jul 12, 2018
214609a
make _read_samples_data the abstract method
Jul 13, 2018
9e10e08
added read_samples_data to base_mcmc
Jul 13, 2018
af6e7b9
add emcee file handling
Jul 13, 2018
b089dca
replace read/write functions with io in BaseSampler
Jul 13, 2018
137dc14
add checkpoint requirement; rename samples raw_samples
Jul 13, 2018
be9b8de
start updating emcee
Jul 13, 2018
f2b04f3
move emcee_pt to it's own module
Jul 13, 2018
5f9c091
add base_mcmc (needs work)
Jul 13, 2018
3d75cab
add write_metadata to models
Jul 16, 2018
f81edab
move setting up checkpoint and run interval to sampler methods
Jul 16, 2018
2f9a2b2
rearrange read/write functions; add checkpoint and finalize methods; …
Jul 16, 2018
866f39a
fix whitespace
Jul 16, 2018
5b90d77
add acl support
Jul 17, 2018
764c741
update executable
Jul 17, 2018
24a9b4f
add finalize to emcee, fix typos
Jul 17, 2018
c35a28f
change write_posterior to expect filename, not file
Jul 17, 2018
26fc718
change burn in module to just have functions
Jul 20, 2018
8d69b15
start to define burn in support class
Jul 20, 2018
8a6506a
move burn in class to burn_in module; add evaluate
Jul 30, 2018
2711460
add write burn in to io
Jul 30, 2018
59201ff
add from_config for burn-in class
Jul 30, 2018
35a8408
more support for burn-in, calculation of independent samples
Jul 30, 2018
c0eb5c6
add thin_start/interval/end to the hdf file attrs
Jul 30, 2018
eead8a8
fix typos, whitespace in burn_in
Jul 31, 2018
e765c12
fix whitespace, typos in base_hdf
Jul 31, 2018
ab40ad0
rename EnsembleMCMCIO to MCMCIO; fix whitespace
Jul 31, 2018
ac6d514
fix typo
Jul 31, 2018
23366e3
fix whitespace
Jul 31, 2018
60d0e75
write filetype to inference hdf files; provide a loadfile function
Jul 31, 2018
704d417
fix some import errors
Jul 31, 2018
adee9c3
remove sampler_class from io to avoid circular imports
Jul 31, 2018
36a5e75
fix bugs
Jul 31, 2018
e871582
fix bugs, move niterations/nsamples into config file
Aug 3, 2018
9046567
add halfchain, posterior_step, min_iterations back to burn_in
Aug 3, 2018
7254c84
fix bugs to get acl working post burn in
Aug 4, 2018
7f0952e
fix bugs in nacl burn in test
Aug 4, 2018
67e188c
write more information to the logging messages
Aug 4, 2018
a73008b
fix bugs in min_iterations burn-in test
Aug 4, 2018
f6e1d5b
fix more bugs
Aug 4, 2018
a257aed
fix pep8 issues
Aug 4, 2018
0a6f82d
fix bugs for running with data
Aug 4, 2018
370613e
whitespace
Aug 4, 2018
343 changes: 70 additions & 273 deletions bin/gwin

Large diffs are not rendered by default.

621 changes: 301 additions & 320 deletions gwin/burn_in.py

Large diffs are not rendered by default.

187 changes: 186 additions & 1 deletion gwin/io/__init__.py
@@ -17,5 +17,190 @@
"""I/O utilities for GWIn
"""

from .hdf import InferenceFile
from __future__ import absolute_import

import os
import shutil
import logging
import h5py as _h5py

from .emcee import EmceeFile
from .txt import InferenceTXTFile

filetypes = {
EmceeFile.name: EmceeFile,
}


def loadfile(path, mode=None, filetype=None, **kwargs):
"""Loads the given file using the appropriate InferenceFile class.

    If ``filetype`` is not provided, this will try to retrieve the
    ``filetype`` from the file's ``attrs``. If the file does not exist yet
    and ``filetype`` is not provided, an IOError is raised.

Parameters
----------
path : str
The filename to load.
mode : str, optional
        The mode to open the file with; e.g., 'w' for write, 'r' for read,
        'a' for append. If not provided, h5py.File's default mode ('a') is
        used.
filetype : str, optional
Force the file to be loaded with the given class name. This must be
provided if creating a new file.

Returns
-------
filetype instance
An open file handler to the file. The class used for IO with the file
is determined by the ``filetype`` keyword (if provided) or the
``filetype`` stored in the file (if not provided).
"""
if filetype is None:
# try to read the file to get its filetype
try:
with _h5py.File(path, 'r') as fp:
filetype = fp.attrs['filetype']
except IOError:
# file doesn't exist, filetype must be provided
raise IOError("The file appears not to exist. In this case, "
"filetype must be provided.")
return filetypes[filetype](path, mode=mode, **kwargs)
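# Illustrative usage (editor's sketch, not part of this diff). For an
# existing file, the 'filetype' stored in the file's attrs selects the IO
# class; creating a new file requires passing ``filetype`` explicitly (file
# names here are hypothetical):
#
#   >>> fp = loadfile('inference.hdf', 'r')  # filetype read from attrs
#   >>> fp = loadfile('new.hdf', 'w', filetype=EmceeFile.name)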

#
# =============================================================================
#
# HDF Utilities
#
# =============================================================================
#


def check_integrity(filename):
"""Checks the integrity of an InferenceFile.

Checks done are:

    * can the file be opened?
* do all of the datasets in the samples group have the same shape?
* can the first and last sample in all of the datasets in the samples
group be read?

If any of these checks fail, an IOError is raised.

Parameters
----------
filename: str
Name of an InferenceFile to check.

Raises
------
ValueError
If the given file does not exist.
KeyError
If the samples group does not exist.
IOError
If any of the checks fail.
"""
# check that the file exists
if not os.path.exists(filename):
raise ValueError("file {} does not exist".format(filename))
# if the file is corrupted such that it cannot be opened, the next line
# will raise an IOError
with loadfile(filename, 'r') as fp:
# check that all datasets in samples have the same shape
        # make a list so the parameters can be indexed under Python 3, where
        # keys() returns a non-indexable view
        parameters = list(fp[fp.samples_group].keys())
group = fp.samples_group + '/{}'
# use the first parameter as a reference shape
ref_shape = fp[group.format(parameters[0])].shape
if not all(fp[group.format(param)].shape == ref_shape
for param in parameters):
raise IOError("not all datasets in the samples group have the "
"same shape")
# check that we can read the first/last sample
firstidx = tuple([0]*len(ref_shape))
lastidx = tuple([-1]*len(ref_shape))
for param in parameters:
fp[group.format(param)][firstidx]
fp[group.format(param)][lastidx]
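# Illustrative usage (editor's sketch, not part of this diff):
# ``check_integrity`` returns nothing on success, so callers detect a bad
# file by catching the exceptions documented above (file name hypothetical):
#
#   >>> try:
#   ...     check_integrity('inference.hdf')
#   ... except (ValueError, KeyError, IOError):
#   ...     print('file is missing, has no samples group, or is corrupt')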


def validate_checkpoint_files(checkpoint_file, backup_file):
"""Checks if the given checkpoint and/or backup files are valid.

The checkpoint file is considered valid if:

* it passes all tests run by ``check_integrity``;
* it has at least one sample written to it (indicating at least one
checkpoint has happened).

    The same applies to the backup file. The backup file must also have the
    same number of samples as the checkpoint file; otherwise, the backup is
    considered invalid.

If the checkpoint (backup) file is found to be valid, but the backup
(checkpoint) file is not valid, then the checkpoint (backup) is copied to
the backup (checkpoint). Thus, this function ensures that checkpoint and
backup files are either both valid or both invalid.

Parameters
----------
checkpoint_file : string
Name of the checkpoint file.
backup_file : string
Name of the backup file.

Returns
-------
checkpoint_valid : bool
Whether or not the checkpoint (and backup) file may be used for loading
samples.
"""
# check if checkpoint file exists and is valid
try:
check_integrity(checkpoint_file)
checkpoint_valid = True
except (ValueError, KeyError, IOError):
checkpoint_valid = False
# backup file
try:
check_integrity(backup_file)
backup_valid = True
except (ValueError, KeyError, IOError):
backup_valid = False
# check if there are any samples in the file; if not, we'll just start from
# scratch
if checkpoint_valid:
with loadfile(checkpoint_file, 'r') as fp:
try:
group = '{}/{}'.format(fp.samples_group, fp.variable_params[0])
nsamples = fp[group].size
checkpoint_valid = nsamples != 0
except KeyError:
checkpoint_valid = False
# check if there are any samples in the backup file
if backup_valid:
with loadfile(backup_file, 'r') as fp:
try:
group = '{}/{}'.format(fp.samples_group, fp.variable_params[0])
backup_nsamples = fp[group].size
backup_valid = backup_nsamples != 0
except KeyError:
backup_valid = False
# check that the checkpoint and backup have the same number of samples;
# if not, assume the checkpoint has the correct number
if checkpoint_valid and backup_valid:
backup_valid = nsamples == backup_nsamples
# decide what to do based on the files' statuses
if checkpoint_valid and not backup_valid:
# copy the checkpoint to the backup
logging.info("Backup invalid; copying checkpoint file")
shutil.copy(checkpoint_file, backup_file)
backup_valid = True
elif backup_valid and not checkpoint_valid:
logging.info("Checkpoint invalid; copying backup file")
# copy the backup to the checkpoint
shutil.copy(backup_file, checkpoint_file)
checkpoint_valid = True
return checkpoint_valid
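# Illustrative usage (editor's sketch, not part of this diff): a sampler
# resuming a run can use the returned flag to decide whether to load the
# previous samples or start from scratch (file names hypothetical):
#
#   >>> if validate_checkpoint_files('run.checkpoint.hdf', 'run.bkup.hdf'):
#   ...     pass  # resume: checkpoint and backup are now both valid
#   ... else:
#   ...     pass  # no usable samples; start the run from scratch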