From 6eaa748aa980622439d5fb69de0eefdb0d2e3440 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 17 Jul 2018 21:50:29 -0400 Subject: [PATCH 01/47] start changing the base sampler api --- gwin/sampler/base.py | 885 ++++--------------------------------------- 1 file changed, 66 insertions(+), 819 deletions(-) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 3601c5b..a7a8fad 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -22,10 +22,10 @@ # ============================================================================= # """ -This modules provides classes and functions for using different sampler -packages for parameter estimation. +Defines the base sampler class to be inherited by all samplers. """ +from abc import ABCMeta import numpy from pycbc.io import FieldArray from pycbc.filter import autocorrelation @@ -41,28 +41,26 @@ # ============================================================================= # -class _BaseSampler(object): - """Base container class for running the inference sampler that will - generate the posterior distributions. +class BaseSampler(object): + """Base container class for inference samplers. Parameters ---------- model : Model An instance of a model from ``gwin.models``. """ + __metaclass__ = ABCMeta name = None def __init__(self, model): self.model = model - self.lastclear = 0 - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """This function create an instance of this sampler from the given - command-line options. + #@classmethod # uncomment when we move to python 3.3 + @abstractmethod + def from_config(cls, cp, model, pool=None, model_call=None, **kwargs): + """This should initialize the sampler given a config file. """ - raise NotImplementedError("from_cli function not set") + pass @property def variable_params(self): @@ -72,841 +70,90 @@ def variable_params(self): @property def sampling_params(self): - """Returns the sampling args used by the model. + """Returns the sampling params used by the model. """ return self.model.sampling_params @property - def chain(self): - """This function should return the past samples as a - [additional dimensions x] niterations x ndim array, where ndim are the - number of model params, niterations the number of iterations, and - additional dimeionions are any additional dimensions used by the - sampler (e.g, walkers, temperatures). + def static_params(self): + """Returns the model's fixed parameters. """ - return NotImplementedError("chain function not set.") + return self.model.static_params - @property + @abstractproperty def samples(self): - """This function should return the past samples as a [additional - dimensions x] niterations field array, where the fields are union - of the sampling args and the model params. - """ - return NotImplementedError("samples function not set.") - - @property - def clear_chain(self): - """This function should clear the current chain of samples from memory. - """ - return NotImplementedError("clear chain function not set.") - - @property - def niterations(self): - """Get the current number of iterations.""" - return self.chain.shape[-2] + self.lastclear - - @property - def acceptance_fraction(self): - """This function should return the fraction of steps accepted by each - walker as an array. + """Should return all of the samples currently stored in memory as a + numpy structure array or FieldArray. 
""" - return NotImplementedError("acceptance_fraction function not set.") + pass - @property - def lnpost(self): - """This function should return the natural logarithm of the likelihood - function used by the sampler as an - [additional dimensions] x niterations array. - """ - return NotImplementedError("lnpost function not set.") - - @property + @abstractproperty def model_stats(self): - """This function should return the prior and likelihood ratio of - samples as an [additional dimensions] x niterations - array. If the model did not return that info to the - sampler, it should return None. + """Should return all of the model's metadata currently stored in + memory as a numpy structure array or FieldArray. """ - return NotImplementedError("model stats not set") + pass - def burn_in(self, initial_values): - """This function should burn in the sampler. - """ - raise NotImplementedError("This sampler has no burn_in function.") - - def run(self, niterations): + @abstractmethod + def run(self): """This function should run the sampler. + + Any checkpointing should be done internally in this function. """ - raise NotImplementedError("run function not set.") - - @classmethod - def calculate_logevidence(cls, fp): - """This function should calculate the log evidence and its error using - the results in the given file. If the sampler does not support evidence - calculation, then this will raise a NotImplementedError. - """ - raise NotImplementedError("this sampler does not support evidence " - "calculation") - - # write and read functions - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - fp.attrs['sampler'] = self.name - fp.attrs['model'] = self.model.name - fp.attrs['variable_params'] = list(self.variable_params) - fp.attrs['sampling_params'] = list(self.sampling_params) - fp.attrs["niterations"] = self.niterations - try: - fp.attrs["lognl"] = self.model.lognl - except AttributeError: - pass - for arg, val in kwargs.items(): - if val is None: - val = str(None) - if isinstance(val, dict): - fp.attrs[arg] = val.keys() - for key, item in val.items(): - if item is None: - item = str(None) - fp.attrs[key] = item - else: - fp.attrs[arg] = val - - @staticmethod - def write_logevidence(fp, lnz, dlnz): - """Writes the given log evidence and its error to the given file. - Results are saved to the file's 'log_evidence' and 'dlog_evidence' - attributes. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - lnz : float - The log of the evidence. - dlnz : float - The error in the estimate of the log evidence. - """ - fp.attrs['log_evidence'] = lnz - fp.attrs['dlog_evidence'] = dlnz - - @staticmethod - def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): - """Writes the burn in iterations to the given file. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - burn_in_iterations : array - Array of values giving the iteration of the burn in of each walker. - is_burned_in : array - Array of booleans indicating which chains are burned in. 
- """ - try: - fp['burn_in_iterations'][:] = burn_in_iterations - except KeyError: - fp['burn_in_iterations'] = burn_in_iterations - fp.attrs['burn_in_iterations'] = burn_in_iterations.max() - if is_burned_in is not None: - try: - fp['is_burned_in'][:] = is_burned_in - except KeyError: - fp['is_burned_in'] = is_burned_in - fp.attrs['is_burned_in'] = is_burned_in.all() - - @staticmethod - def write_state(fp): - """Saves the state of the sampler in a file. - """ - fp.write_random_state() - - @staticmethod - def set_state_from_file(fp): - """Sets the state of the sampler back to the instance saved in a file. - """ - numpy.random.set_state(fp.read_random_state()) - - -class BaseMCMCSampler(_BaseSampler): - """This class is used to construct the MCMC sampler from the kombine-like - packages. - - Parameters - ---------- - sampler : sampler instance - An instance of an MCMC sampler similar to kombine or emcee. - model : model class - A model from ``gwin.models``. - - Attributes - ---------- - sampler : - The MCMC sampler instance used. - p0 : nwalkers x ndim array - The initial position of the walkers. Set by using set_p0. If not set - yet, a ValueError is raised when the attribute is accessed. - pos : {None, array} - An array of the current walker positions. - """ - name = None - - def __init__(self, sampler, model): - self._sampler = sampler - self._pos = None - self._p0 = None - self._currentblob = None - self._nwalkers = None - self.lastclear = 0 - self.burn_in_iterations = None - # initialize - super(BaseMCMCSampler, self).__init__(model) - - @property - def sampler(self): - return self._sampler - - @property - def pos(self): - return self._pos - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An nwalkers x ndim array of the initial positions that were set. - """ - # create a (nwalker, ndim) array for initial positions - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1) - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms(samples) - # draw random samples if samples are not provided - else: - samples = self.model.prior_rvs(size=nwalkers, prior=prior) - # convert to 2D array - for i, param in enumerate(self.sampling_params): - p0[:, i] = samples[param] - self._p0 = p0 - return p0 - - @property - def p0(self): - if self._p0 is None: - raise ValueError("initial positions not set; run set_p0") - return self._p0 - - @property - def nwalkers(self): - """Get the number of walkers.""" - return self._nwalkers - - @property - def acceptance_fraction(self): - """Get the fraction of steps accepted by each walker as an array. - """ - return self._sampler.acceptance_fraction - - @property - def samples(self): - """Returns the samples in the chain as a FieldArray. - - If the sampling args are not the same as the model params, the - returned samples will have both the sampling and the model params. - - The returned FieldArray has dimension [additional dimensions x] - nwalkers x niterations. 
- """ - # chain is a [additional dimensions x] niterations x ndim array - samples = self.chain - sampling_params = self.sampling_params - # convert to dictionary to apply boundary conditions - samples = {param: samples[..., ii] for - ii, param in enumerate(sampling_params)} - samples = self.model.prior_distribution.apply_boundary_conditions( - **samples) - # now convert to field array - samples = FieldArray.from_arrays([samples[param] - for param in sampling_params], - names=sampling_params) - # apply transforms to go to model params space - if self.model.sampling_transforms is not None: - samples = self.model.sampling_transforms.apply(samples, - inverse=True) - return samples + pass - @property - def model_stats(self): - """Returns the model stats as a FieldArray, with field names - corresponding to the type of data returned by the model. - The returned array has shape nwalkers x niterations. If no additional - stats were returned to the sampler by the model, returns - None. - """ - stats = numpy.array(self._sampler.blobs) - if stats.size == 0: - return None - # we'll force arrays to float; this way, if there are `None`s in the - # blobs, they will be changed to `nan`s - arrays = {field: stats[..., fi].astype(float) - for fi, field in - enumerate(self.model.default_stats)} - return FieldArray.from_kwargs(**arrays).transpose() + @abstractmethod + def write_samples(cls, fp, samples, group="samples", **kwargs): + """This should write all of the provided samples to the given hdf file. - # write and read functions - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. + This function should be used to write both samples and model stats. Parameters ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword args are written to the file's ``attrs``. - """ - super(BaseMCMCSampler, self).write_metadata(fp, **kwargs) - # add info about walkers, burn in - fp.attrs["nwalkers"] = self.nwalkers - - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a model params. The samples are - written as an ``nwalkers x niterations`` array. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. + fp : open hdf file + The file to write to. + samples : structure array-like + Samples should be provided as a numpy structure array or a + FieldArray (basically, anything for which ``samples['param']`` will + return a numpy array). + group : str, optional + The group in ``fp`` to write the ``samples`` to. Default is + "samples". 
+ \**kwargs : + Any other keyword args the sampler needs to write data. """ - nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=1) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, " - "but dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (nwalkers, istop), - maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, istart:istop] = samples[param] - - def write_chain(self, fp, start_iteration=None, max_iterations=None): - """Writes the samples from the current chain to the given file. - - Results are written to: + pass - `fp[fp.samples_group/{field}/(temp{k}/)walker{i}]`, + @abstractmethod + def read_samples(cls, fp, parameters, group="samples", **kwargs): + """This should read the requested parameters from the given hdf file. - where `{i}` is the index of a walker, `{field}` is the name of each - field returned by ``model_stats``, and, if the sampler is - multitempered, `{k}` is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - samples_group : str - Name of samples group to write. - """ - # samples is a nwalkers x niterations field array - samples = self.samples - parameters = self.variable_params - samples_group = fp.samples_group - # write data - self.write_samples_group(fp, samples_group, parameters, samples, - start_iteration=start_iteration, - max_iterations=max_iterations) - - def write_model_stats(self, fp, start_iteration=None, - max_iterations=None): - """Writes the ``model_stats`` to the given file. - - Results are written to: - - `fp[fp.stats_group/{field}/(temp{k}/)walker{i}]`, - - where `{i}` is the index of a walker, `{field}` is the name of each - field returned by ``model_stats``, and, if the sampler is - multitempered, `{k}` is the temperature. If nothing is returned by - ``model_stats``, this does nothing. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - - Returns - ------- - stats : {FieldArray, None} - The stats that were written, as a FieldArray. 
If there were no - stats, returns None. - """ - samples = self.model_stats - if samples is None: - return None - # ensure the prior is in the model params parameter space - if 'logjacobian' in samples.fieldnames: - samples['logprior'] -= samples['logjacobian'] - parameters = samples.fieldnames - samples_group = fp.stats_group - # write data - self.write_samples_group(fp, samples_group, parameters, samples, - start_iteration=start_iteration, - max_iterations=max_iterations) - return samples - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction]`. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - dataset_name = "acceptance_fraction" - try: - fp[dataset_name][:] = self.acceptance_fraction - except KeyError: - # dataset doesn't exist yet, create it - fp[dataset_name] = self.acceptance_fraction - - def write_results(self, fp, start_iteration=None, - max_iterations=None, **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. Also computes and writes the autocorrleation lengths - of the chains. See the various write function for details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the acceptance fraction has not previously been - written to the file. The default (None) is to use the maximum size - allowed by h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. 
- """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - else: - widx = slice(0, None) - # get the slice to use - if iteration is not None: - get_index = iteration - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][widx, get_index] - if flatten: - arr = arr.flatten() - arrays[name] = arr - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested walkers - x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields(fp, samples_group, loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration, walkers=walkers, - flatten=flatten) - - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in a file. - - The number of independent samples are counted starting from after - burn-in. 
If the sampler hasn't burned in yet, then 0 is returned. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read. - - Returns - ------- - int - The number of independent samples. - """ - # check if burned in - if not fp.is_burned_in: - return 0 - # we'll just read a single parameter from the file - samples = cls.read_samples(fp, fp.variable_params[0]) - return samples.size - - @staticmethod - def read_acceptance_fraction(fp, walkers=None): - """Reads the acceptance fraction from the given file. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested walkers,). - """ - group = 'acceptance_fraction' - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) - else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - return fp[group][wmask] - - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None): - """Computes the autocorrleation function of the model params in the - given file. - - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain. An - ACF per-walker will be returned instead if ``per_walker=True``. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``nwalkers x niterations``. - """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - for param in parameters: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] - for ii in walkers] - acfs[param] = numpy.vstack(arrays) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - walkers=walkers, - flatten=False)[param] - samples = samples.mean(axis=0) - acfs[param] = autocorrelation.calculate_acf(samples).numpy() - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params in the - given file. - - Parameter values are averaged over all walkers at each iteration. 
- The ACL is then calculated over the averaged chain. If the returned ACL - is `inf`, will default to the number of current iterations. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary giving the ACL for each parameter. - """ - acls = {} - for param in fp.variable_params: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - flatten=False)[param] - samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - acls[param] = acl - return acls - - @staticmethod - def write_acls(fp, acls): - """Writes the given autocorrelation lengths to the given file. - - The ACL of each parameter is saved to ``fp['acls/{param}']``. - The maximum over all the parameters is saved to the file's 'acl' - attribute. + The samples should be returned as a ``FieldArray``. Parameters ---------- - fp : InferenceFile - An open file handler to write the samples to. - acls : dict - A dictionary of ACLs keyed by the parameter. - - Returns - ------- - ACL - The maximum of the acls that was written to the file. + fp : open hdf file + The file to read from. + parameters : list of str + List of the parameters to return. May include functions. + group : str, optional + The group in ``fp`` to read the ``samples`` from. Default is + "samples". + \**kwargs : + Any other keyword args the sampler needs to read data. """ - group = 'acls/{}' - # write the individual acls - for param in acls: - try: - # we need to use the write_direct function because it's - # apparently the only way to update scalars in h5py - fp[group.format(param)].write_direct(numpy.array(acls[param])) - except KeyError: - # dataset doesn't exist yet - fp[group.format(param)] = acls[param] - # write the maximum over all params - fp.attrs['acl'] = numpy.array(acls.values()).max() - return fp.attrs['acl'] + pass - @staticmethod - def read_acls(fp): - """Reads the acls of all the parameters in the given file. + @abstractmethod + def write_posterior(cls, posterior_fp, **kwargs): + """This should write a posterior plus any other metadata to the given + file. Parameters ---------- - fp : InferenceFile - An open file handler to read the acls from. - - Returns - ------- - dict - A dictionary of the ACLs, keyed by the parameter name. + posterior_fp : open hdf file + The file to write to. + \**kwargs : + Any other keyword args the sampler needs to write the posterior. """ - group = fp['acls'] - return {param: group[param].value for param in group.keys()} + pass From d41964dbccef15130d20fe1c498f0f1e4c7259eb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 18:50:37 +0200 Subject: [PATCH 02/47] start InferenceFile -> BaseInferenceFile --- gwin/io/hdf.py | 637 +++++++++++++++++++------------------------------ 1 file changed, 247 insertions(+), 390 deletions(-) diff --git a/gwin/io/hdf.py b/gwin/io/hdf.py index 1799694..a89172e 100644 --- a/gwin/io/hdf.py +++ b/gwin/io/hdf.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016 Christopher M. Biwer +# Copyright (C) 2016 Christopher M. 
Biwer, Collin Capano # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 3 of the License, or (at your @@ -28,6 +28,7 @@ import os import sys import logging +from abc import ABCMeta import numpy @@ -41,64 +42,7 @@ from .. import sampler as gwin_sampler -class _PosteriorOnlyParser(object): - """Provides interface for reading/writing samples from/to an InferenceFile - that contains flattened posterior samples. - """ - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None): - """Reads fields from the given file. - """ - if iteration is not None: - get_index = iteration - else: - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # load - arrays = {} - group = fields_group + '/{}' - arrays = {field: fp[group.format(field)][get_index] - for field in fields} - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, samples_group=None, - thin_start=0, thin_end=None, thin_interval=1, - iteration=None, array_class=None): - """Reads posterior samples from a posterior-only file. - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields(fp, samples_group, loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration) - - @staticmethod - def write_samples_group(fp, samples_group, fields, samples): - """Writes the given samples to the given samples group. - """ - for field in samples.fieldnames: - grp = '{}/{}'.format(samples_group, field) - fp[grp] = samples[field] - - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in the file. - """ - return cls.read_samples(fp, fp.variable_params[0]).size - - -class InferenceFile(h5py.File): +class BaseInferenceFile(h5py.File): """ A subclass of the h5py.File object that has extra functions for handling reading and writing the samples from the samplers. @@ -109,139 +53,105 @@ class InferenceFile(h5py.File): mode : {None, str} The mode to open the file, eg. "w" for write and "r" for read. """ - name = "hdf" + __metaclass__ = ABCMeta + + name = None samples_group = 'samples' - stats_group = 'model_stats' - sampler_group = 'sampler_states' + sampler_group = 'sampler_info' + data_group = 'data' + injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): super(InferenceFile, self).__init__(path, mode, **kwargs) - @property - def posterior_only(self): - """Whether the file only contains flattened posterior samples. + def __getattr__(self, attr): + """Things stored in ``.attrs`` are promoted to instance attributes. + + Note that properties will be called before this, so if there are any + properties that share the same name as something in ``.attrs``, that + property will get returned. 
""" - try: - return self.attrs['posterior_only'] - except KeyError: - return False - - @property - def sampler_name(self): - """Returns the name of the sampler that was used.""" - return self.attrs["sampler"] - - @property - def sampler_class(self): - """Returns the sampler class that was used.""" - try: - sampler = self.sampler_name - except KeyError: - return None - return gwin_sampler.samplers[sampler] - - @property - def samples_parser(self): - """Returns the class to use to read/write samples from/to the file.""" - if self.posterior_only: - return _PosteriorOnlyParser - else: - return self.sampler_class - - @property - def model_name(self): - """Returns the name of the model that was used.""" - return self.attrs["model"] + return self.attrs[attr] - @property - def variable_params(self): - """Returns list of variable_params. + @abstractmethod + def write_samples(self, samples, **kwargs): + """This should write all of the provided samples. - Returns - ------- - variable_params : {list, str} - List of str that contain variable_params keys. - """ - return self.attrs["variable_params"] + This function should be used to write both samples and model stats. - @property - def static_params(self): - """Returns a dictionary of the static_params. The keys are the argument - names, values are the value they were set to. + Parameters + ---------- + fp : open hdf file + The file to write to. + samples : structure array-like + Samples should be provided as a numpy structure array or a + FieldArray (basically, anything for which ``samples['param']`` will + return a numpy array). + \**kwargs : + Any other keyword args the sampler needs to write data. """ - return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} + pass - @property - def sampling_params(self): - """Returns the parameters that were used to sample. + @abstractmethod + def read_samples(self, parameters, **kwargs): + """This should read the requested parameters. - Returns - ------- - sampling_params : {list, str} - List of the sampling params. - """ - return self.attrs["sampling_params"] - - @property - def lognl(self): - """Returns the log noise likelihood.""" - return self.attrs["lognl"] + The samples should be returned as a ``FieldArray``. - @property - def niterations(self): - """Returns number of iterations performed. + Parameters + ---------- + fp : open hdf file + The file to read from. + parameters : list of str + List of the parameters to return. May include functions. + \**kwargs : + Any other keyword args the sampler needs to read data. Returns ------- - niterations : int - Number of iterations performed. + FieldArray : + The samples as a FieldArray. """ - return self.attrs["niterations"] + pass - @property - def n_independent_samples(self): - """Returns the number of independent samples stored in the file. - """ - return self.samples_parser.n_independent_samples(self) + @abstractmethod + def write_posterior(self, posterior_fp, **kwargs): + """This should write a posterior plus any other metadata to the given + file. - @property - def burn_in_iterations(self): - """Returns number of iterations in the burn in. + Parameters + ---------- + posterior_fp : open hdf file + The file to write to. + \**kwargs : + Any other keyword args the sampler needs to write the posterior. """ - return self.attrs["burn_in_iterations"] + pass @property - def is_burned_in(self): - """Returns whether or not the sampler is burned in. 
- """ - return self.attrs["is_burned_in"] + def sampler_class(self): + """Returns the sampler class that was used.""" + try: + sampler = self.sampler_name + except KeyError: + return None + return gwin_sampler.samplers[sampler] @property - def nwalkers(self): - """Returns number of walkers used. - - Returns - ------- - nwalkesr : int - Number of walkers used. + def static_params(self): + """Returns a dictionary of the static_params. The keys are the argument + names, values are the value they were set to. """ - return self.attrs["nwalkers"] - - @property - def ntemps(self): - """Returns number of temperatures used.""" - return self.attrs["ntemps"] + return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} @property - def acl(self): - """ Returns the saved autocorelation length (ACL). - - Returns - ------- - acl : {int, float} - The ACL. + def n_independent_samples(self): + """Returns the number of independent samples stored in the file. """ - return self.attrs["acl"] + try: + return self.attrs['n_independent_samples'] + except KeyError: + return 0 @property def cmd(self): @@ -260,21 +170,54 @@ def cmd(self): cmd = cmd[-1] return cmd - @property - def resume_points(self): - """The iterations at which a run was resumed from checkpoint. + def write_metadata(self, sampler, **kwargs): + """Writes the sampler's metadata. - Returns - ------- - resume_points : array or None - An array of integers giving the points at which the run resumed. + Parameters + ---------- + sampler : gwin.sampler + An instance of a gwin sampler. + **kwargs : + All keyword arguments are saved as separate arguments in the + file attrs. If any keyword argument is a dictionary, the keyword + will point to the list of keys in the the file's ``attrs``. Each + key is then stored as a separate attr with its corresponding value. + """ + self.attrs['sampler'] = samlper.name + self.attrs['model'] = sampler.model.name + self.attrs['variable_params'] = list(sampler.variable_params) + self.attrs['sampling_params'] = list(sampler.sampling_params) + # FIXME: what will write this? + #fp.attrs["lognl"] = self.model.lognl + # add the static params to the kwargs + kwargs['static_params'] = sampler.static_params + for arg, val in kwargs.items(): + if val is None: + val = str(None) + if isinstance(val, dict): + self.attrs[arg] = val.keys() + for key, item in val.items(): + if item is None: + item = str(None) + self.attrs[key] = item + else: + self.attrs[arg] = val + + def write_logevidence(self, lnz, dlnz): + """Writes the given log evidence and its error. - Raises - ------ - KeyError - If the run never resumed from a checkpoint. + Results are saved to file's 'log_evidence' and 'dlog_evidence' + attributes. + + Parameters + ---------- + lnz : float + The log of the evidence. + dlnz : float + The error in the estimate of the log evidence. """ - return self.attrs['resume_points'] + self.attrs['log_evidence'] = lnz + self.attrs['dlog_evidence'] = dlnz @property def log_evidence(self): @@ -283,115 +226,37 @@ def log_evidence(self): """ return self.attrs["log_evidence"], self.attrs["dlog_evidence"] - def read_samples(self, parameters, samples_group=None, **kwargs): - """Reads samples from the file. - - Parameters - ----------- - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `samples_group`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. 
- samples_group : str - Group in HDF InferenceFile that parameters belong to. - **kwargs : - The rest of the keyword args are passed to the sampler's - `read_samples` method. - - Returns - ------- - FieldArray - Samples for the given parameters, as an instance of a - FieldArray. - """ - # get the appropriate sampler class - samples_group = samples_group if samples_group else self.samples_group - return self.samples_parser.read_samples(self, parameters, - samples_group=samples_group, - **kwargs) - - def read_model_stats(self, **kwargs): - """Reads model stats from self. - - Parameters - ----------- - **kwargs : - The keyword args are passed to the sampler's - ``read_model_stats`` method. - - Returns - ------- - stats : {FieldArray, None} - Likelihood stats in the file, as a FieldArray. The fields of the - array are the names of the stats that are in the ``model_stats`` - group. - """ - parameters = self[self.stats_group].keys() - return self.read_samples(parameters, samples_group=self.stats_group, - **kwargs) + def write_random_state(self, group=None, state=None): + """Writes the state of the random number generator from the file. - def read_acceptance_fraction(self, **kwargs): - """Returns the acceptance fraction that was written to the file. + The random state is written to ``sampler_group``/random_state. Parameters ---------- - **kwargs : - All keyword arguments are passed to the sampler's - `read_acceptance_fraction` function. - Returns - ------- - numpy.array - The acceptance fraction. - """ - return self.sampler_class.read_acceptance_fraction(self, **kwargs) - - def read_acls(self): - """Returns all of the individual chains' acls. See the `read_acls` - function of this file's sampler for more details. - """ - return self.sampler_class.read_acls(self) - - def read_label(self, parameter, error_on_none=False): - """Returns the label for the parameter. - - Parameters - ----------- - parameter : str - Name of parameter to get a label for. Will first try to retrieve - a label from this file's "label" attributes. If the parameter - is not found there, will look for a label from - pycbc.waveform.parameters. - error_on_none : {False, bool} - If True, will raise a ValueError if a label cannot be found, or if - the label is None. Otherwise, the parameter will just be returned - if no label can be found. - - Returns - ------- - label : str - A formatted string for the name of the paramter. + group : str + Name of group to write random state to. + state : tuple, optional + Specify the random state to write. If None, will use + ``numpy.random.get_state()``. 
""" - # get label - try: - label = self[parameter].attrs["label"] - except KeyError: - # try looking in pycbc.waveform.parameters - try: - label = getattr(wfparams, parameter).label - except AttributeError: - label = None - if label is None: - if error_on_none: - raise ValueError("Cannot find a label for paramter %s" % ( - parameter)) - else: - return parameter - return label + group = self.sampler_group if group is None else group + dataset_name = "/".join([group, "random_state"]) + if state is None: + state = numpy.random.get_state() + s, arr, pos, has_gauss, cached_gauss = state + if group in self: + self[dataset_name][:] = arr + else: + self.create_dataset(dataset_name, arr.shape, fletcher32=True, + dtype=arr.dtype) + self[dataset_name][:] = arr + self[dataset_name].attrs["s"] = s + self[dataset_name].attrs["pos"] = pos + self[dataset_name].attrs["has_gauss"] = has_gauss + self[dataset_name].attrs["cached_gauss"] = cached_gauss def read_random_state(self, group=None): - """ Reads the state of the random number generator from the file. + """Reads the state of the random number generator from the file. Parameters ---------- @@ -412,6 +277,11 @@ def read_random_state(self, group=None): cached_gauss = self[dataset_name].attrs["cached_gauss"] return s, arr, pos, has_gauss, cached_gauss + def load_random_state(self): + """Sets numpy's random state using what is saved in the file. + """ + numpy.random.set_state(self.read_random_state()) + def write_strain(self, strain_dict, group=None): """Writes strain for each IFO to file. @@ -423,7 +293,7 @@ def write_strain(self, strain_dict, group=None): The group to write the strain to. If None, will write to the top level. """ - subgroup = "{ifo}/strain" + subgroup = self.data_group + "/{ifo}/strain" if group is None: group = subgroup else: @@ -445,7 +315,7 @@ def write_stilde(self, stilde_dict, group=None): The group to write the strain to. If None, will write to the top level. """ - subgroup = "{ifo}/stilde" + subgroup = self.data_group + "/{ifo}/stilde" if group is None: group = subgroup else: @@ -469,7 +339,7 @@ def write_psd(self, psds, low_frequency_cutoff, group=None): The group to write the strain to. If None, will write to the top level. """ - subgroup = "{ifo}/psds/0" + subgroup = self.data_group + "/{ifo}/psds/0" if group is None: group = subgroup else: @@ -522,24 +392,19 @@ def write_data(self, strain_dict=None, stilde_dict=None, if strain_dict is not None: self.write_strain(strain_dict, group=group) - def write_injections(self, injection_file, ifo): - """ Writes injection parameters for an IFO to file. + def write_injections(self, injection_file): + """Writes injection parameters from the given injection file. + + Everything in the injection file is copied to ``injections_group``. Parameters ---------- injection_file : str Path to HDF injection file. - ifo : str - IFO name. 
""" - subgroup = "{ifo}/injections" - self.create_group(subgroup.format(ifo=ifo)) try: with h5py.File(injection_file, "r") as fp: - for param in fp.keys(): - self[subgroup.format(ifo=ifo)][param] = fp[param][:] - for key in fp.attrs.keys(): - self[subgroup.format(ifo=ifo)].attrs[key] = fp.attrs[key] + super(BaseInferenceFile, self).copy(fp, self.injections_group) except IOError: logging.warn("Could not read %s as an HDF file", injection_file) @@ -563,47 +428,6 @@ def write_command_line(self): previous = [] self.attrs["cmd"] = cmd + previous - def write_resume_point(self): - """Keeps a list of the number of iterations that were in a file when a - run was resumed from a checkpoint.""" - try: - resume_pts = self.attrs["resume_points"].tolist() - except KeyError: - resume_pts = [] - try: - niterations = self.niterations - except KeyError: - niterations = 0 - resume_pts.append(niterations) - self.attrs["resume_points"] = resume_pts - - def write_random_state(self, group=None, state=None): - """ Writes the state of the random number generator from the file. - - Parameters - ---------- - group : str - Name of group to read random state to. - state : tuple, optional - Specify the random state to write. If None, will use - ``numpy.random.get_state()``. - """ - group = self.sampler_group if group is None else group - dataset_name = "/".join([group, "random_state"]) - if state is None: - state = numpy.random.get_state() - s, arr, pos, has_gauss, cached_gauss = state - if group in self: - self[dataset_name][:] = arr - else: - self.create_dataset(dataset_name, arr.shape, fletcher32=True, - dtype=arr.dtype) - self[dataset_name][:] = arr - self[dataset_name].attrs["s"] = s - self[dataset_name].attrs["pos"] = pos - self[dataset_name].attrs["has_gauss"] = has_gauss - self[dataset_name].attrs["cached_gauss"] = cached_gauss - def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): """Formats a slice using the given arguments that can be used to retrieve a thinned array from an InferenceFile. @@ -651,8 +475,7 @@ def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): def copy_metadata(self, other): """Copies all metadata from this file to the other file. - Metadata is defined as all data that is not in either the samples or - stats group. + Metadata is defined as everything in the top-level ``.attrs``. Parameters ---------- @@ -660,70 +483,60 @@ def copy_metadata(self, other): An open inference file to write the data to. """ logging.info("Copying metadata") - # copy non-samples/stats data - for key in self.keys(): - if key not in [self.samples_group, self.stats_group]: - super(InferenceFile, self).copy(key, other) # copy attributes for key in self.attrs.keys(): other.attrs[key] = self.attrs[key] - def copy(self, other, parameters=None, parameter_names=None, - posterior_only=False, **kwargs): - """Copies data in this file to another file. + def copy_info(self, other, ignore=None): + """Copies "info" from this file to the other. - The samples and stats to copy may be down selected using the given - kwargs. All other data (the "metadata") are copied exactly. + "Info" is defined all groups that are not the samples group. Parameters ---------- - other : str or InferenceFile - The file to write to. May be either a string giving a filename, - or an open hdf file. If the former, the file will be opened with - the write attribute (note that if a file already exists with that - name, it will be deleted). + other : output file + The output file. Must be an hdf file. 
+ ignore : (list of) str + Don't copy the given groups. + """ + logging.info("Copying info") + # copy non-samples/stats data + if ignore is None: + ignore = [] + if isinstance(ignore, (str, unicode)): + ignore = [ignore] + ignore = set(ignore + [self.samples_group]) + copy_groups = set(self.keys()) - ignore + for key in copy_groups: + super(BaseInferenceFile, self).copy(key, other) + + def copy_samples(self, other, parameters=None, parameter_names=None, + read_args=None, write_args=None): + """Should copy samples to the other files. + + Parameters + ---------- + other : InferenceFile + An open inference file to write to. parameters : list of str, optional List of parameters to copy. If None, will copy all parameters. parameter_names : dict, optional Rename one or more parameters to the given name. The dictionary should map parameter -> parameter name. If None, will just use the original parameter names. - posterior_only : bool, optional - Write the samples and model stats as flattened arrays, and - set other's posterior_only attribute. For example, if this file - has a parameter's samples written to - `{samples_group}/{param}/walker{x}`, then other will have all of - the selected samples from all walkers written to - `{samples_group}/{param}/`. - **kwargs : - All other keyword arguments are passed to `read_samples`. - - Returns - ------- - InferenceFile - The open file handler to other. + read_args : dict, optional + Arguments to pass to ``read_samples``. + write_args : dict, optional + Arguments to pass to ``write_samples``. """ - if not isinstance(other, h5py.File): - # check that we're not trying to overwrite this file - if other == self.name: - raise IOError("destination is the same as this file") - other = InferenceFile(other, 'w') - # copy metadata over - self.copy_metadata(other) - # update other's posterior attribute - if posterior_only: - other.attrs['posterior_only'] = posterior_only # select the samples to copy logging.info("Reading samples to copy") if parameters is None: parameters = self.variable_params - # if list of desired parameters is different, rename model params + # if list of desired parameters is different, rename if set(parameters) != set(self.variable_params): other.attrs['variable_params'] = parameters - # if only the posterior is desired, we'll flatten the results - if not posterior_only and not self.posterior_only: - kwargs['flatten'] = False - samples = self.read_samples(parameters, **kwargs) + samples = self.read_samples(parameters, **read_args) logging.info("Copying {} samples".format(samples.size)) # if different parameter names are desired, get them from the samples if parameter_names: @@ -733,23 +546,67 @@ def copy(self, other, parameters=None, parameter_names=None, samples = FieldArray.from_kwargs(**arrs) other.attrs['variable_params'] = samples.fieldnames logging.info("Writing samples") - other.samples_parser.write_samples_group(other, self.samples_group, - samples.fieldnames, samples) - # do the same for the model stats - logging.info("Reading stats to copy") - stats = self.read_model_stats(**kwargs) - logging.info("Writing stats") - other.samples_parser.write_samples_group(other, self.stats_group, - stats.fieldnames, stats) + other.write_samples(other, samples, **write_args) + + def copy(self, other, ignore=None, parameters=None, parameter_names=None, + read_args=None, write_args=None): + """Copies metadata, info, and samples in this file to another file. + + Parameters + ---------- + other : str or InferenceFile + The file to write to. 
May be either a string giving a filename, + or an open hdf file. If the former, the file will be opened with + the write attribute (note that if a file already exists with that + name, it will be deleted). + ignore : (list of) strings + Don't copy the given groups. If the samples group is included, no + samples will be copied. + parameters : list of str, optional + List of parameters in the samples group to copy. If None, will copy + all parameters. + parameter_names : dict, optional + Rename one or more parameters to the given name. The dictionary + should map parameter -> parameter name. If None, will just use the + original parameter names. + read_args : dict, optional + Arguments to pass to ``read_samples``. + write_args : dict, optional + Arguments to pass to ``write_samples``. + + Returns + ------- + InferenceFile + The open file handler to other. + """ + if not isinstance(other, h5py.File): + # check that we're not trying to overwrite this file + if other == self.name: + raise IOError("destination is the same as this file") + other = InferenceFile(other, 'w') + # metadata + self.copy_metadata(other) + # info + if ignore is None: + ignore = [] + if isinstance(ignore, (str, unicode)): + ignore = [ignore] + self.copy_info(other, ignore=ignore) + # samples + if self.samples_group not in ignore: + self.copy_samples(other, parameters=parameters, + parameter_names=parameter_names, + read_args=read_args, + write_args=write_args) # if any down selection was done, re-set the burn in iterations and # the acl, and the niterations. # The last dimension of the samples returned by the sampler should # be the number of iterations. - if samples.shape[-1] != self.niterations: - other.attrs['acl'] = 1 - other.attrs['burn_in_iterations'] = 0 - other.attrs['niterations'] = samples.shape[-1] - return other + #if samples.shape[-1] != self.niterations: + # other.attrs['acl'] = 1 + # other.attrs['burn_in_iterations'] = 0 + # other.attrs['niterations'] = samples.shape[-1] + #return other def check_integrity(filename): From cef9e8cb917befbf3f52238357b1851c5a0e39eb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 18:51:28 +0200 Subject: [PATCH 03/47] rename hdf.py base_hdf.py --- gwin/io/{hdf.py => base_hdf.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename gwin/io/{hdf.py => base_hdf.py} (100%) diff --git a/gwin/io/hdf.py b/gwin/io/base_hdf.py similarity index 100% rename from gwin/io/hdf.py rename to gwin/io/base_hdf.py From 69721023b254702805c420fc4671e1e8899a7339 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 19:22:36 +0200 Subject: [PATCH 04/47] add parse_parameters function --- gwin/io/base_hdf.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index a89172e..d92d3f1 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -41,7 +41,6 @@ from .. import sampler as gwin_sampler - class BaseInferenceFile(h5py.File): """ A subclass of the h5py.File object that has extra functions for handling reading and writing the samples from the samplers. @@ -92,6 +91,43 @@ def write_samples(self, samples, **kwargs): """ pass + def parse_parameters(self, parameters, array_class=None): + """Parses a parameters arg to figure out what fields need to be loaded. + + Parameters + ---------- + parameters : (list of) strings + The parameter(s) to retrieve. 
A parameter can be the name of any + field in ``samples_group``, a virtual field or method of + ``FieldArray`` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + array_class : array class, optional + The type of array to use to parse the parameters. The class must have a + ``parse_parameters`` method. Default is to use a ``FieldArray``. + + Returns + ------- + list : + A list of strings giving the fields to load from the file. + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + return array_class.parse_parameters(parameters, possible_fields) + + def _parse_parameters(self, parameters, **kwargs): + """Decorator function for read samples that calls parse parameters. + """ + array_class = kwargs.pop('array_class', None) + def dostuff(parameters, **kwargs): + parameters = self.parse_parameters(parameters, array_class) + return self.read_samples(parameters, **kwargs) + return dostuff + + @_parse_parameters @abstractmethod def read_samples(self, parameters, **kwargs): """This should read the requested parameters. From 7c7e6153838bffc3363fab11d520394c0158f8fe Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 19:22:54 +0200 Subject: [PATCH 05/47] add module for base mcmc io --- gwin/io/base_mcmc.py | 254 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 gwin/io/base_mcmc.py diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py new file mode 100644 index 0000000..545e8e0 --- /dev/null +++ b/gwin/io/base_mcmc.py @@ -0,0 +1,254 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides I/O that is specific to MCMC samplers. +""" + +import os +import sys +import logging +from abc import ABCMeta + +import numpy + +import h5py + +from pycbc import DYN_RANGE_FAC +from pycbc.io import FieldArray +from pycbc.types import FrequencySeries +from pycbc.waveform import parameters as wfparams + +from .hdf import InferenceFile + +class EnsembleMCMCIO(obect): + + __metaclass__ = ABCMeta + + @abstractmethod + def read_acls(self): + """Should return all of the individual chains' acls. + """ + pass + + def write_mcmc_metadata(self, sampler): + """Writes metadata unique to an ensemble MCMC. + + Parameters + ---------- + sampler : gwin.sampler + An instance of a gwin sampler. 
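+
+        For example, after this is called, ``self.attrs["nwalkers"]`` will
+        equal ``sampler.nwalkers`` and ``self.attrs["niterations"]`` will
+        equal ``sampler.niterations``.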
+ """ + self.attrs["niterations"] = sampler.niterations + self.attrs["nwalkers"] = sampler.nwalkers + + def write_samples(self, parameters, samples, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to: + + ``fp[samples_group/{vararg}]``, + + where ``{vararg}`` is the name of a model params. The samples are + written as an ``nwalkers x niterations`` array. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + samples_group : str + Name of samples group to write. + parameters : list + The parameters to write to the file. + samples : FieldArray + The samples to write. Should be a FieldArray with fields containing + the samples to write and shape nwalkers x niterations. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. + """ + nwalkers, niterations = samples.shape + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = samples_group + '/{name}' + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = fp[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + fp[dataset_name].resize(istop, axis=1) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, " + "but dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + fp.create_dataset(dataset_name, (nwalkers, istop), + maxshape=(nwalkers, max_iterations), + dtype=float, fletcher32=True) + fp[dataset_name][:, istart:istop] = samples[param] + + def read_samples(self, parameters, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True, + array_class=None): + """Reads samples for the given parameter(s). + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. A parameter can be the name of any + field in `fp[fp.samples_group]`, a virtual field or method of + `FieldArray` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + thin_start : int + Index of the sample to begin returning samples. Default is to read + samples after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all samples + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + iteration : int + Get a single iteration. If provided, will override the + `thin_{start/interval/end}` arguments. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. 
+ flatten : {True, bool} + The returned array will be one dimensional, with all desired + samples from all desired walkers concatenated together. If False, + the returned array will have dimension requested walkers + x requested iterations. + samples_group : {None, str} + The group in `fp` from which to retrieve the parameter fields. If + None, searches in `fp.samples_group`. + array_class : {None, array class} + The type of array to return. The class must have a `from_kwargs` + class method and a `parse_parameters` method. If None, will return + a FieldArray. + + Returns + ------- + array_class + Samples for the given parameters, as an instance of a the given + `array_class` (`FieldArray` if `array_class` is None). + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + return self._read_fields(loadfields, array_class, + thin_start=thin_start, + thin_interval=thin_interval, thin_end=thin_end, + iteration=iteration, walkers=walkers, + flatten=flatten) + + def _read_fields(self, fields, array_class, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): + """Base function for reading samples and model stats. See + `read_samples` and `read_model_stats` for details. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + fields_group : str + The name of the group to retrieve the desired fields. + fields : list + The list of field names to retrieve. Must be names of groups in + `fp[fields_group/]`. + array_class : FieldArray or similar + The type of array to return. Must have a `from_kwargs` attribute. + + For other details on keyword arguments, see `read_samples` and + `read_model_stats`. + + Returns + ------- + array_class + An instance of the given array class populated with values + retrieved from the fields. 
+ """ + # walkers to load + if walkers is not None: + widx = numpy.zeros(fp.nwalkers, dtype=bool) + widx[walkers] = True + else: + widx = slice(0, None) + # get the slice to use + if iteration is not None: + get_index = iteration + else: + if thin_end is None: + # use the number of current iterations + thin_end = fp.niterations + get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval) + # load + arrays = {} + group = fields_group + '/{name}' + for name in fields: + arr = fp[group.format(name=name)][widx, get_index] + if flatten: + arr = arr.flatten() + arrays[name] = arr + return array_class.from_kwargs(**arrays) + + def write_resume_point(self): + """Keeps a list of the number of iterations that were in a file when a + run was resumed from a checkpoint.""" + try: + resume_pts = self.attrs["resume_points"].tolist() + except KeyError: + resume_pts = [] + try: + niterations = self.niterations + except KeyError: + niterations = 0 + resume_pts.append(niterations) + self.attrs["resume_points"] = resume_pts + From 214609a1c34329b53013644a23c4411bc13dad05 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:44:20 +0200 Subject: [PATCH 06/47] make _read_samples_data the abstract method --- gwin/io/base_hdf.py | 77 +++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index d92d3f1..6bd7864 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -28,7 +28,7 @@ import os import sys import logging -from abc import ABCMeta +from abc import ABCMeta, abstractmethod, abstractproperty import numpy @@ -42,7 +42,9 @@ from .. import sampler as gwin_sampler class BaseInferenceFile(h5py.File): - """ A subclass of the h5py.File object that has extra functions for + """Base class for all inference hdf files. + + This is a subclass of the h5py.File object. It adds functions for handling reading and writing the samples from the samplers. Parameters @@ -61,7 +63,7 @@ class BaseInferenceFile(h5py.File): injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): - super(InferenceFile, self).__init__(path, mode, **kwargs) + super(BaseInferenceFile, self).__init__(path, mode, **kwargs) def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. @@ -82,10 +84,8 @@ def write_samples(self, samples, **kwargs): ---------- fp : open hdf file The file to write to. - samples : structure array-like - Samples should be provided as a numpy structure array or a - FieldArray (basically, anything for which ``samples['param']`` will - return a numpy array). + samples : dict + Samples should be provided as a dictionary of numpy arrays. \**kwargs : Any other keyword args the sampler needs to write data. """ @@ -118,35 +118,58 @@ def parse_parameters(self, parameters, array_class=None): possible_fields = self[self.samples_group].keys() return array_class.parse_parameters(parameters, possible_fields) - def _parse_parameters(self, parameters, **kwargs): - """Decorator function for read samples that calls parse parameters. - """ - array_class = kwargs.pop('array_class', None) - def dostuff(parameters, **kwargs): - parameters = self.parse_parameters(parameters, array_class) - return self.read_samples(parameters, **kwargs) - return dostuff + def read_samples(self, parameters, array_class=None, **kwargs): + """Reads samples for the given parameter(s). 
- @_parse_parameters - @abstractmethod - def read_samples(self, parameters, **kwargs): - """This should read the requested parameters. + The ``parameters`` can be the name of any dataset in ``samples_group``, + a virtual field or method of ``FieldArray`` (as long as the file + contains the necessary fields to derive the virtual field or method), + and/or any numpy function of these. - The samples should be returned as a ``FieldArray``. + The ``parameters`` are parsed to figure out what datasets are needed. + Only those datasets will be loaded, and will be the base-level fields + of the returned ``FieldArray``. + + The ``static_params`` are also added as attributes of the returned + ``FieldArray``. Parameters - ---------- - fp : open hdf file - The file to read from. - parameters : list of str - List of the parameters to return. May include functions. + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. + array_class : FieldArray-like class, optional + The type of array to return. The class must have ``from_kwargs`` + and ``parse_parameters`` methods. If None, will return a + ``FieldArray``. \**kwargs : - Any other keyword args the sampler needs to read data. + All other keyword arguments are passed to ``_read_samples_data``. Returns ------- FieldArray : - The samples as a FieldArray. + The samples as a ``FieldArray``. + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + samples = self._read_samples_data(loadfields, **kwargs) + # convert to FieldArray + samples = array_class.from_kwargs(**samples) + # add the static params + for p,val in self.static_params.items(): + setattr(samples, p, val) + return samples + + @abstractmethod + def _read_samples_data(self, fields, **kwargs): + """Low level function for reading datasets in the samples group. + + This should return a dictionary of numpy arrays. """ pass From 9e10e08e14d2c78766d929ab8e6e60b978e97cd2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:45:34 +0200 Subject: [PATCH 07/47] added read_samples_data to base_mcmc --- gwin/io/base_mcmc.py | 115 ++++++++----------------------------------- 1 file changed, 20 insertions(+), 95 deletions(-) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 545e8e0..a597c6d 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -41,7 +41,8 @@ from .hdf import InferenceFile class EnsembleMCMCIO(obect): - + """Abstract base class that provides some IO functions for ensemble MCMCs. + """ __metaclass__ = ABCMeta @abstractmethod @@ -74,15 +75,11 @@ def write_samples(self, parameters, samples, Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. parameters : list The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. + samples : dict + The samples to write. Each array in the dictionary should have + shape nwalkers x niterations. start_iteration : int, optional Write results to the file's datasets starting at the given iteration. 
Default is to append after the last iteration in the @@ -93,11 +90,14 @@ def write_samples(self, parameters, samples, to file. The default (None) is to use the maximum size allowed by h5py. """ - nwalkers, niterations = samples.shape + nwalkers, niterations = samples.values()[0].shape + assert(all(p.shape == (nwalkers, niterations) + for p in samples.values()), + "all samples must have the same shape") if max_iterations is not None and max_iterations < niterations: raise IndexError("The provided max size is less than the " "number of iterations") - group = samples_group + '/{name}' + group = self.samples_group + '/{name}' # loop over number of dimensions for param in parameters: dataset_name = group.format(name=param) @@ -122,96 +122,21 @@ def write_samples(self, parameters, samples, dtype=float, fletcher32=True) fp[dataset_name][:, istart:istop] = samples[param] - def read_samples(self, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True, - array_class=None): - """Reads samples for the given parameter(s). + def _read_samples_data(self, fields, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): + """Base function for reading samples. Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested walkers - x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). 
- """ - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = self[self.samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return self._read_fields(loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration, walkers=walkers, - flatten=flatten) - - def _read_fields(self, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. + The list of field names to retrieve. Must be names of datasets in + the ``samples_group``. Returns ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. + dict + A dictionary of field name -> numpy array pairs. """ # walkers to load if walkers is not None: @@ -229,14 +154,14 @@ def _read_fields(self, fields, array_class, get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, thin_interval=thin_interval) # load + group = self.samples_group + '/{name}' arrays = {} - group = fields_group + '/{name}' for name in fields: arr = fp[group.format(name=name)][widx, get_index] if flatten: arr = arr.flatten() arrays[name] = arr - return array_class.from_kwargs(**arrays) + return arrays def write_resume_point(self): """Keeps a list of the number of iterations that were in a file when a From af6e7b9510222612e1e50bf6c52bca0df23e7557 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:47:41 +0200 Subject: [PATCH 08/47] add emcee file handling --- gwin/io/emcee.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 gwin/io/emcee.py diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py new file mode 100644 index 0000000..c127da5 --- /dev/null +++ b/gwin/io/emcee.py @@ -0,0 +1,75 @@ +# Copyright (C) 2018 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides IO for the emcee sampler. 
+""" + +from .base_hdf import BaseInferenceFile +from .base_mcmc import EnsembleMCMCIO + +class EmceeFile(EnsembleMCMCIO, BaseInferenceFile): + """Class to handle file IO for the ``emcee`` sampler.""" + + name = 'emcee_file' + + def read_acceptance_fraction(self, walkers=None): + """Reads the acceptance fraction from the given file. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + + Returns + ------- + array + Array of acceptance fractions with shape (requested walkers,). + """ + group = self.sampler_group + '/acceptance_fraction' + if walkers is None: + wmask = numpy.ones(self.nwalkers, dtype=bool) + else: + wmask = numpy.zeros(self.nwalkers, dtype=bool) + wmask[walkers] = True + return self[group][wmask] + + def write_acceptance_fraction(self, acceptance_fraction): + """Write acceptance_fraction data to file. Results are written to + `fp[acceptance_fraction]`. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + """ + group = self.sampler_group + '/acceptance_fraction' + try: + self[group][:] = acceptance_fraction + except KeyError: + # dataset doesn't exist yet, create it + self[group] = acceptance_fraction + + From b089dca8702d589ca873fa3a4878a05777e7bfd2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:53:08 +0200 Subject: [PATCH 09/47] replace read/write functions with io in BaseSampler --- gwin/sampler/base.py | 62 ++++++-------------------------------------- 1 file changed, 8 insertions(+), 54 deletions(-) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index a7a8fad..542f040 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -25,7 +25,7 @@ Defines the base sampler class to be inherited by all samplers. """ -from abc import ABCMeta +from abc import ABCMeta, abstractmethod, abstractproperty import numpy from pycbc.io import FieldArray from pycbc.filter import autocorrelation @@ -102,58 +102,12 @@ def run(self): """ pass - @abstractmethod - def write_samples(cls, fp, samples, group="samples", **kwargs): - """This should write all of the provided samples to the given hdf file. - - This function should be used to write both samples and model stats. - - Parameters - ---------- - fp : open hdf file - The file to write to. - samples : structure array-like - Samples should be provided as a numpy structure array or a - FieldArray (basically, anything for which ``samples['param']`` will - return a numpy array). - group : str, optional - The group in ``fp`` to write the ``samples`` to. Default is - "samples". - \**kwargs : - Any other keyword args the sampler needs to write data. - """ - pass - - @abstractmethod - def read_samples(cls, fp, parameters, group="samples", **kwargs): - """This should read the requested parameters from the given hdf file. - - The samples should be returned as a ``FieldArray``. - - Parameters - ---------- - fp : open hdf file - The file to read from. - parameters : list of str - List of the parameters to return. May include functions. - group : str, optional - The group in ``fp`` to read the ``samples`` from. Default is - "samples". - \**kwargs : - Any other keyword args the sampler needs to read data. - """ - pass - - @abstractmethod - def write_posterior(cls, posterior_fp, **kwargs): - """This should write a posterior plus any other metadata to the given - file. 
- - Parameters - ---------- - posterior_fp : open hdf file - The file to write to. - \**kwargs : - Any other keyword args the sampler needs to write the posterior. + @abstractproperty + def io(self): + """A class that inherits from ``BaseInferenceFile`` to handle IO with + an hdf file. + + This should be a class, not an instance of class, so that the sampler + can initialize it when needed. """ pass From 137dc140949855ae3dc67bb599ef11a1b96db359 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:41:08 +0200 Subject: [PATCH 10/47] add checkpoint requirement; rename samples raw_samples --- gwin/sampler/base.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 542f040..38b5029 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -81,16 +81,20 @@ def static_params(self): return self.model.static_params @abstractproperty - def samples(self): - """Should return all of the samples currently stored in memory as a - numpy structure array or FieldArray. + def raw_samples(self): + """A dict mapping sampling_params to arrays of samples currently + in memory. + + The sample arrays may have any shape, and may or may not be thinned. """ pass @abstractproperty def model_stats(self): - """Should return all of the model's metadata currently stored in - memory as a numpy structure array or FieldArray. + """A dict mapping model's metadata fields to arrays of values for + each sample in ``raw_samples``. + + The arrays may have any shape, and may or may not be thinned. """ pass @@ -111,3 +115,10 @@ def io(self): can initialize it when needed. """ pass + + @abstractmethod + def checkpoint(self): + """The sampler must have a checkpoint method for dumping raw samples + and stats to the file type defined by ``io``. + """ + pass From be9b8de51331d77e365a9d85dd467ff571e979c4 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:41:42 +0200 Subject: [PATCH 11/47] start updating emcee --- gwin/sampler/emcee.py | 777 +++--------------------------------------- 1 file changed, 43 insertions(+), 734 deletions(-) diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 97786b5..7cad975 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -43,7 +43,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(BaseMCMCSampler): +class EmceeEnsembleSampler(BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. @@ -78,9 +78,7 @@ def __init__(self, model, nwalkers, pool=None, # to have the same state as the numpy generator rstate = numpy.random.get_state() sampler.random_state = rstate - # initialize - super(EmceeEnsembleSampler, self).__init__( - sampler, model) + self._sampler = sampler self._nwalkers = nwalkers @classmethod @@ -104,6 +102,47 @@ def from_cli(cls, opts, model, pool=None, return cls(model, opts.nwalkers, pool=pool, model_call=model_call) + @property + def raw_samples(self): + """A dict mapping sampling_params to arrays of samples currently + in memory. + + The arrays have shape ``nwalkers`` x ``niterations``. 
+ """ + # chain is a [additional dimensions x] niterations x ndim array + samples = self.chain + sampling_params = self.sampling_params + # convert to dictionary to apply boundary conditions + samples = {param: samples[..., ii] for + ii, param in enumerate(sampling_params)} + samples = self.model._prior.apply_boundary_conditions( + **samples) + # now convert to field array + samples = FieldArray.from_arrays([samples[param] + for param in sampling_params], + names=sampling_params) + # apply transforms to go to model params space + return self.model.apply_sampling_transforms( + samples, inverse=True) + + @property + def model_stats(self): + """Returns the model stats as a FieldArray, with field names + corresponding to the type of data returned by the model. + The returned array has shape nwalkers x niterations. If no additional + stats were returned to the sampler by the model, returns + None. + """ + stats = numpy.array(self._sampler.blobs) + if stats.size == 0: + return None + # we'll force arrays to float; this way, if there are `None`s in the + # blobs, they will be changed to `nan`s + arrays = {field: stats[..., fi].astype(float) + for fi, field in + enumerate(self.model.metadata_fields)} + return FieldArray.from_kwargs(**arrays).transpose() + @property def lnpost(self): """Get the natural logarithm of the likelihood as an @@ -222,733 +261,3 @@ def write_results(self, fp, start_iteration=None, max_iterations=max_iterations) self.write_acceptance_fraction(fp) self.write_state(fp) - - -# This is needed for two reason -# 1) pools freeze state when created and so classes *cannot be updated* -# 2) methods cannot be pickled. -class _callprior(object): - """Calls the model's prior function, and ensures that no - metadata is returned.""" - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - prior = self.callable(args, callstat='logprior', - return_all_stats=False) - return prior - - -class _callloglikelihood(object): - """Calls the model's loglikelihood function. - """ - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - return self.callable(args, callstat='loglikelihood', - return_all_stats=False) - - -class EmceePTSampler(BaseMCMCSampler): - """This class is used to construct a parallel-tempered MCMC sampler from - the emcee package's PTSampler. - - Parameters - ---------- - model : model - A model from ``gwin.models``. - ntemps : int - Number of temeratures to use in the sampler. - nwalkers : int - Number of walkers to use in sampler. - pool : function with map, Optional - A provider of a map function that allows a function call to be run - over multiple sets of arguments and possibly maps them to - cores/nodes/etc. - """ - name = "emcee_pt" - - def __init__(self, model, ntemps, nwalkers, pool=None, - model_call=None): - - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - if model_call is None: - model_call = model - - # construct the sampler: PTSampler needs the likelihood and prior - # functions separately - ndim = len(model.variable_params) - sampler = emcee.PTSampler(ntemps, nwalkers, ndim, - _callloglikelihood(model_call), - _callprior(model_call), - pool=pool) - # initialize - super(EmceePTSampler, self).__init__( - sampler, model) - self._nwalkers = nwalkers - self._ntemps = ntemps - - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. 
- - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. - - Returns - ------- - EmceePTSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.ntemps, opts.nwalkers, - pool=pool, model_call=model_call) - - @property - def ntemps(self): - return self._ntemps - - @property - def chain(self): - """Get all past samples as an ntemps x nwalker x niterations x ndim - array. - """ - # emcee returns the chain as ntemps x nwalker x niterations x ndim - return self._sampler.chain - - def clear_chain(self): - """Clears the chain and blobs from memory. - """ - # store the iteration that the clear is occuring on - self.lastclear = self.niterations - # now clear the chain - self._sampler.reset() - - @property - def model_stats(self): - """Returns the log likelihood ratio and log prior as a FieldArray. - The returned array has shape ntemps x nwalkers x niterations. - """ - # likelihood has shape ntemps x nwalkers x niterations - logl = self._sampler.lnlikelihood - # get prior from posterior - logp = self._sampler.lnprobability - logl - # compute the likelihood ratio - loglr = logl - self.model.lognl - kwargs = {'loglr': loglr, 'logprior': logp} - # if different coordinates were used for sampling, get the jacobian - if self.model.sampling_transforms is not None: - samples = self.samples - # convert to dict - d = {param: samples[param] for param in samples.fieldnames} - logj = self.model.logjacobian(**d) - kwargs['logjacobian'] = logj - return FieldArray.from_kwargs(**kwargs) - - @property - def lnpost(self): - """Get the natural logarithm of the likelihood + the prior as an - ntemps x nwalkers x niterations array. - """ - # emcee returns ntemps x nwalkers x niterations - return self._sampler.lnprobability - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An ntemps x nwalkers x ndim array of the initial positions that - were set. - """ - # create a (nwalker, ndim) array for initial positions - ntemps = self.ntemps - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((ntemps, nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1, temps='all', - flatten=False)[..., 0] - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms( - samples) - # draw random samples if samples are not provided - else: - samples = self.model.prior_rvs( - size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) - # convert to array - for i, param in enumerate(self.sampling_params): - p0[..., i] = samples[param] - self._p0 = p0 - return p0 - - def run(self, niterations, **kwargs): - """Advance the ensemble for a number of samples. - - Parameters - ---------- - niterations : int - Number of samples to get from sampler. - - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). 
- lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). - rstate : - The current state of the random number generator. - """ - pos = self._pos - if pos is None: - pos = self.p0 - res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] - # update the positions - self._pos = p - return p, lnpost, rstate - - # read/write functions - - # add ntemps and betas to metadata - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - super(EmceePTSampler, self).write_metadata(fp, **kwargs) - fp.attrs["ntemps"] = self.ntemps - fp.attrs["betas"] = self._sampler.betas - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction/temp{k}]` where k is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - group = "acceptance_fraction/temp{tk}" - # acf has shape ntemps x nwalkers - acf = self.acceptance_fraction - for tk in range(fp.ntemps): - try: - fp[group.format(tk=tk)][:] = acf[tk, :] - except KeyError: - # dataset doesn't exist yet, create it - fp[group.format(tk=tk)] = acf[tk, :] - - @staticmethod - def read_acceptance_fraction(fp, temps=None, walkers=None): - """Reads the acceptance fraction from the given file. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - temps : {None, (list of) int} - The temperature index (or a list of indices) to retrieve. If None, - acfs from all temperatures and all walkers will be retrieved. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested temps, - requested walkers). - """ - group = 'acceptance_fraction/temp{tk}' - if temps is None: - temps = numpy.arange(fp.ntemps) - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) - else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - arrays = [] - for tk in temps: - arrays.extend(fp[group.format(tk=tk)][wmask]) - return arrays - - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a variable arg. The samples are - written as an ``ntemps x nwalkers x niterations`` array. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. 
Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - """ - ntemps, nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=2) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, but " - "dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), - maxshape=(ntemps, nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, :, istart:istop] = samples[param] - - def write_results(self, fp, start_iteration=None, max_iterations=None, - **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. 
- """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - nwalkers = widx.sum() - else: - widx = slice(None, None) - nwalkers = fp.nwalkers - # temperatures to load - selecttemps = False - if temps is None: - tidx = 0 - ntemps = 1 - elif isinstance(temps, int): - tidx = temps - ntemps = 1 - else: - # temps is either 'all' or a list of temperatures; - # in either case, we'll get all of the temperatures from the file; - # if not 'all', then we'll pull out the ones we want - tidx = slice(None, None) - selecttemps = temps != 'all' - if selecttemps: - ntemps = len(temps) - else: - ntemps = fp.ntemps - # get the slice to use - if iteration is not None: - get_index = iteration - niterations = 1 - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # we'll just get the number of iterations from the returned shape - niterations = None - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][tidx, widx, get_index] - if niterations is None: - niterations = arr.shape[-1] - # pull out the temperatures we need - if selecttemps: - arr = arr[temps, ...] - if flatten: - arr = arr.flatten() - else: - # ensure that the returned array is 3D - arr = arr.reshape((ntemps, nwalkers, niterations)) - arrays[name] = arr - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=0, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - temps : {None, (list of) int, 'all'} - The temperature index (or list of indices) to retrieve. If None, - only samples from the coldest (= 0) temperature chain will be - retrieved. To retrieve all temperates pass 'all', or a list of - all of the temperatures. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested temps x requested - walkers x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. 
- array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields( - fp, samples_group, loadfields, array_class, - thin_start=thin_start, thin_interval=thin_interval, - thin_end=thin_end, iteration=iteration, temps=temps, - walkers=walkers, flatten=flatten) - - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None, - temps=None): - """Computes the autocorrleation function of the model params in the - given file. - - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain for each - temperature. An ACF per-walker will be returned instead if - ``per_walker=True``. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - temps : optional, (list of) int or 'all' - The temperature index (or list of indices) to retrieve. If None - (the default), the ACF will only be computed for the coldest (= 0) - temperature chain. To compute an ACF for all temperates pass 'all', - or a list of all of the temperatures. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``ntemps x nwalkers x niterations``. Otherwise, the returned - array will have shape ``ntemps x niterations``. 
- """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - if isinstance(temps, int): - temps = [temps] - elif temps == 'all': - temps = numpy.arange(fp.ntemps) - elif temps is None: - temps = [0] - for param in parameters: - subacfs = [] - for tk in temps: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param, - temps=tk)[param][0, :] - for ii in walkers] - # we'll stack all of the walker arrays to make a single - # nwalkers x niterations array; when these are stacked - # below, we'll get a ntemps x nwalkers x niterations array - subacfs.append(numpy.vstack(arrays)) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, - thin_end=end_index, - walkers=walkers, temps=tk, - flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - thisacf = autocorrelation.calculate_acf(samples).numpy() - subacfs.append(thisacf) - # stack the temperatures - # FIXME: the following if/else can be condensed to a single line - # using numpy.stack, once the version requirements are bumped to - # numpy >= 1.10 - if per_walker: - nw, ni = subacfs[0].shape - acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) - for tk in range(len(temps)): - acfs[param][tk, ...] = subacfs[tk] - else: - acfs[param] = numpy.vstack(subacfs) - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params and - temperatures in the given file. - - Parameter values are averaged over all walkers at each iteration and - temperature. The ACL is then calculated over the averaged chain. If - the returned ACL is `inf`, will default to the number of current - iterations. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary of ntemps-long arrays of the ACLs of each parameter. - """ - acls = {} - if end_index is None: - end_index = fp.niterations - tidx = numpy.arange(fp.ntemps) - for param in fp.variable_params: - these_acls = numpy.zeros(fp.ntemps, dtype=int) - for tk in tidx: - samples = cls.read_samples(fp, param, thin_start=start_index, - thin_interval=1, thin_end=end_index, - temps=tk, flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - these_acls[tk] = acl - acls[param] = these_acls - return acls - - @classmethod - def calculate_logevidence(cls, fp, thin_start=None, thin_end=None, - thin_interval=None): - """Calculates the log evidence from the given file using emcee's - thermodynamic integration. 
- - Parameters - ---------- - fp : InferenceFile - An open file handler to read the stats from. - thin_start : int - Index of the sample to begin returning stats. Default is to read - stats after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all stats - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - - Returns - ------- - lnZ : float - The estimate of log of the evidence. - dlnZ : float - The error on the estimate. - """ - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - stats_group = fp.stats_group - parameters = fp[stats_group].keys() - logstats = cls.read_samples(fp, parameters, samples_group=stats_group, - thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval, - temps='all', flatten=False) - # get the likelihoods - logls = logstats['loglr'] + fp.lognl - # we need the betas that were used - betas = fp.attrs['betas'] - # annoyingly, theromdynaimc integration in PTSampler is an instance - # method, so we'll implement a dummy one - ntemps = fp.ntemps - nwalkers = fp.nwalkers - ndim = len(fp.variable_params) - dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None, - None, betas=betas) - return dummy_sampler.thermodynamic_integration_log_evidence( - logls=logls, fburnin=0.) From f2b04f3d896da4b9d69213ad4b1268d23a449dc2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:42:13 +0200 Subject: [PATCH 12/47] move emcee_pt to it's own module --- gwin/sampler/emcee_pt.py | 754 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 754 insertions(+) create mode 100644 gwin/sampler/emcee_pt.py diff --git a/gwin/sampler/emcee_pt.py b/gwin/sampler/emcee_pt.py new file mode 100644 index 0000000..8cb6605 --- /dev/null +++ b/gwin/sampler/emcee_pt.py @@ -0,0 +1,754 @@ +# Copyright (C) 2016 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +""" +This modules provides classes and functions for using the emcee sampler +packages for parameter estimation. +""" + +# This is needed for two reason +# 1) pools freeze state when created and so classes *cannot be updated* +# 2) methods cannot be pickled. 
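+
+# The classes in this module use ``numpy`` and ``FieldArray``, but no imports
+# have been added to this new file yet; the following is an assumed minimal
+# set. They would normally sit at the top of the module, along with an import
+# for the ``BaseMCMCSampler`` base class used below, whose location is not
+# settled here and so is left out.
+import numpy
+
+from pycbc.io import FieldArray
+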
+class _callprior(object): + """Calls the model's prior function, and ensures that no + metadata is returned.""" + def __init__(self, model_call): + self.callable = model_call + + def __call__(self, args): + prior = self.callable(args, callfunc='prior') + return prior if isinstance(prior, numpy.float64) else prior[0] + + +class _callloglikelihood(object): + """Calls the model's loglikelihood function. + """ + def __init__(self, model_call): + self.callable = model_call + + def __call__(self, args): + return self.callable(args, callfunc='loglikelihood') + + +class EmceePTSampler(BaseMCMCSampler): + """This class is used to construct a parallel-tempered MCMC sampler from + the emcee package's PTSampler. + + Parameters + ---------- + model : model + A model from ``gwin.models``. + ntemps : int + Number of temeratures to use in the sampler. + nwalkers : int + Number of walkers to use in sampler. + pool : function with map, Optional + A provider of a map function that allows a function call to be run + over multiple sets of arguments and possibly maps them to + cores/nodes/etc. + """ + name = "emcee_pt" + + def __init__(self, model, ntemps, nwalkers, pool=None, + model_call=None): + + try: + import emcee + except ImportError: + raise ImportError("emcee is not installed.") + + if model_call is None: + model_call = model + + # construct the sampler: PTSampler needs the likelihood and prior + # functions separately + ndim = len(model.variable_params) + sampler = emcee.PTSampler(ntemps, nwalkers, ndim, + _callloglikelihood(model_call), + _callprior(model_call), + pool=pool) + # initialize + super(EmceePTSampler, self).__init__( + sampler, model) + self._nwalkers = nwalkers + self._ntemps = ntemps + + @classmethod + def from_cli(cls, opts, model, pool=None, + model_call=None): + """Create an instance of this sampler from the given command-line + options. + + Parameters + ---------- + opts : ArgumentParser options + The options to parse. + model : LikelihoodEvaluator + The model to use with the sampler. + + Returns + ------- + EmceePTSampler + An emcee sampler initialized based on the given arguments. + """ + return cls(model, opts.ntemps, opts.nwalkers, + pool=pool, model_call=model_call) + + @property + def ntemps(self): + return self._ntemps + + @property + def chain(self): + """Get all past samples as an ntemps x nwalker x niterations x ndim + array. + """ + # emcee returns the chain as ntemps x nwalker x niterations x ndim + return self._sampler.chain + + def clear_chain(self): + """Clears the chain and blobs from memory. + """ + # store the iteration that the clear is occuring on + self.lastclear = self.niterations + # now clear the chain + self._sampler.reset() + + @property + def model_stats(self): + """Returns the log likelihood ratio and log prior as a FieldArray. + The returned array has shape ntemps x nwalkers x niterations. 
+ """ + # likelihood has shape ntemps x nwalkers x niterations + logl = self._sampler.lnlikelihood + # get prior from posterior + logp = self._sampler.lnprobability - logl + # compute the likelihood ratio + loglr = logl - self.model.lognl + kwargs = {'loglr': loglr, 'prior': logp} + # if different coordinates were used for sampling, get the jacobian + if self.model.sampling_transforms is not None: + samples = self.samples + # convert to dict + d = {param: samples[param] for param in samples.fieldnames} + logj = self.model.logjacobian(**d) + kwargs['logjacobian'] = logj + return FieldArray.from_kwargs(**kwargs) + + @property + def lnpost(self): + """Get the natural logarithm of the likelihood + the prior as an + ntemps x nwalkers x niterations array. + """ + # emcee returns ntemps x nwalkers x niterations + return self._sampler.lnprobability + + def set_p0(self, samples_file=None, prior=None): + """Sets the initial position of the walkers. + + Parameters + ---------- + samples_file : InferenceFile, optional + If provided, use the last iteration in the given file for the + starting positions. + prior : JointDistribution, optional + Use the given prior to set the initial positions rather than + ``model``'s prior. + + Returns + ------- + p0 : array + An ntemps x nwalkers x ndim array of the initial positions that + were set. + """ + # create a (nwalker, ndim) array for initial positions + ntemps = self.ntemps + nwalkers = self.nwalkers + ndim = len(self.variable_params) + p0 = numpy.ones((ntemps, nwalkers, ndim)) + # if samples are given then use those as initial positions + if samples_file is not None: + samples = self.read_samples(samples_file, self.variable_params, + iteration=-1, temps='all', + flatten=False)[..., 0] + # transform to sampling parameter space + samples = self.model.apply_sampling_transforms( + samples) + # draw random samples if samples are not provided + else: + samples = self.model.prior_rvs( + size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) + # convert to array + for i, param in enumerate(self.sampling_params): + p0[..., i] = samples[param] + self._p0 = p0 + return p0 + + def run(self, niterations, **kwargs): + """Advance the ensemble for a number of samples. + + Parameters + ---------- + niterations : int + Number of samples to get from sampler. + + Returns + ------- + p : numpy.array + An array of current walker positions with shape (nwalkers, ndim). + lnpost : numpy.array + The list of log posterior probabilities for the walkers at + positions p, with shape (nwalkers, ndim). + rstate : + The current state of the random number generator. + """ + pos = self._pos + if pos is None: + pos = self.p0 + res = self._sampler.run_mcmc(pos, niterations, **kwargs) + p, lnpost, rstate = res[0], res[1], res[2] + # update the positions + self._pos = p + return p, lnpost, rstate + + # read/write functions + + # add ntemps and betas to metadata + def write_metadata(self, fp, **kwargs): + """Writes metadata about this sampler to the given file. Metadata is + written to the file's `attrs`. + + Parameters + ---------- + fp : InferenceFile + A file handler to an open inference file. + **kwargs : + All keyword arguments are saved as separate arguments in the + file attrs. If any keyword argument is a dictionary, the keyword + will point to the list of keys in the the file's ``attrs``. Each + key is then stored as a separate attr with its corresponding value. 
+ """ + super(EmceePTSampler, self).write_metadata(fp, **kwargs) + fp.attrs["ntemps"] = self.ntemps + fp.attrs["betas"] = self._sampler.betas + + def write_acceptance_fraction(self, fp): + """Write acceptance_fraction data to file. Results are written to + `fp[acceptance_fraction/temp{k}]` where k is the temperature. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + """ + group = "acceptance_fraction/temp{tk}" + # acf has shape ntemps x nwalkers + acf = self.acceptance_fraction + for tk in range(fp.ntemps): + try: + fp[group.format(tk=tk)][:] = acf[tk, :] + except KeyError: + # dataset doesn't exist yet, create it + fp[group.format(tk=tk)] = acf[tk, :] + + @staticmethod + def read_acceptance_fraction(fp, temps=None, walkers=None): + """Reads the acceptance fraction from the given file. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + temps : {None, (list of) int} + The temperature index (or a list of indices) to retrieve. If None, + acfs from all temperatures and all walkers will be retrieved. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + + Returns + ------- + array + Array of acceptance fractions with shape (requested temps, + requested walkers). + """ + group = 'acceptance_fraction/temp{tk}' + if temps is None: + temps = numpy.arange(fp.ntemps) + if walkers is None: + wmask = numpy.ones(fp.nwalkers, dtype=bool) + else: + wmask = numpy.zeros(fp.nwalkers, dtype=bool) + wmask[walkers] = True + arrays = [] + for tk in temps: + arrays.extend(fp[group.format(tk=tk)][wmask]) + return arrays + + @staticmethod + def write_samples_group(fp, samples_group, parameters, samples, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to: + + ``fp[samples_group/{vararg}]``, + + where ``{vararg}`` is the name of a variable arg. The samples are + written as an ``ntemps x nwalkers x niterations`` array. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + samples_group : str + Name of samples group to write. + parameters : list + The parameters to write to the file. + samples : FieldArray + The samples to write. Should be a FieldArray with fields containing + the samples to write and shape nwalkers x niterations. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. 
+ """ + ntemps, nwalkers, niterations = samples.shape + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = samples_group + '/{name}' + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = fp[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + fp[dataset_name].resize(istop, axis=2) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, but " + "dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), + maxshape=(ntemps, nwalkers, max_iterations), + dtype=float, fletcher32=True) + fp[dataset_name][:, :, istart:istop] = samples[param] + + def write_results(self, fp, start_iteration=None, max_iterations=None, + **metadata): + """Writes metadata, samples, model stats, and acceptance fraction + to the given file. See the write function for each of those for + details. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. + \**metadata : + All other keyword arguments are passed to ``write_metadata``. + """ + self.write_metadata(fp, **metadata) + self.write_chain(fp, start_iteration=start_iteration, + max_iterations=max_iterations) + self.write_model_stats(fp, start_iteration=start_iteration, + max_iterations=max_iterations) + self.write_acceptance_fraction(fp) + self.write_state(fp) + + @staticmethod + def _read_fields(fp, fields_group, fields, array_class, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=None, walkers=None, flatten=True): + """Base function for reading samples and model stats. See + `read_samples` and `read_model_stats` for details. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + fields_group : str + The name of the group to retrieve the desired fields. + fields : list + The list of field names to retrieve. Must be names of groups in + `fp[fields_group/]`. + array_class : FieldArray or similar + The type of array to return. Must have a `from_kwargs` attribute. + + For other details on keyword arguments, see `read_samples` and + `read_model_stats`. + + Returns + ------- + array_class + An instance of the given array class populated with values + retrieved from the fields. 
+ """ + # walkers to load + if walkers is not None: + widx = numpy.zeros(fp.nwalkers, dtype=bool) + widx[walkers] = True + nwalkers = widx.sum() + else: + widx = slice(None, None) + nwalkers = fp.nwalkers + # temperatures to load + selecttemps = False + if temps is None: + tidx = 0 + ntemps = 1 + elif isinstance(temps, int): + tidx = temps + ntemps = 1 + else: + # temps is either 'all' or a list of temperatures; + # in either case, we'll get all of the temperatures from the file; + # if not 'all', then we'll pull out the ones we want + tidx = slice(None, None) + selecttemps = temps != 'all' + if selecttemps: + ntemps = len(temps) + else: + ntemps = fp.ntemps + # get the slice to use + if iteration is not None: + get_index = iteration + niterations = 1 + else: + if thin_end is None: + # use the number of current iterations + thin_end = fp.niterations + get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval) + # we'll just get the number of iterations from the returned shape + niterations = None + # load + arrays = {} + group = fields_group + '/{name}' + for name in fields: + arr = fp[group.format(name=name)][tidx, widx, get_index] + if niterations is None: + niterations = arr.shape[-1] + # pull out the temperatures we need + if selecttemps: + arr = arr[temps, ...] + if flatten: + arr = arr.flatten() + else: + # ensure that the returned array is 3D + arr = arr.reshape((ntemps, nwalkers, niterations)) + arrays[name] = arr + return array_class.from_kwargs(**arrays) + + @classmethod + def read_samples(cls, fp, parameters, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=0, walkers=None, flatten=True, + samples_group=None, array_class=None): + """Reads samples for the given parameter(s). + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. A parameter can be the name of any + field in `fp[fp.samples_group]`, a virtual field or method of + `FieldArray` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + thin_start : int + Index of the sample to begin returning samples. Default is to read + samples after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all samples + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + iteration : int + Get a single iteration. If provided, will override the + `thin_{start/interval/end}` arguments. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + temps : {None, (list of) int, 'all'} + The temperature index (or list of indices) to retrieve. If None, + only samples from the coldest (= 0) temperature chain will be + retrieved. To retrieve all temperates pass 'all', or a list of + all of the temperatures. + flatten : {True, bool} + The returned array will be one dimensional, with all desired + samples from all desired walkers concatenated together. If False, + the returned array will have dimension requested temps x requested + walkers x requested iterations. + samples_group : {None, str} + The group in `fp` from which to retrieve the parameter fields. If + None, searches in `fp.samples_group`. 
+ array_class : {None, array class} + The type of array to return. The class must have a `from_kwargs` + class method and a `parse_parameters` method. If None, will return + a FieldArray. + + Returns + ------- + array_class + Samples for the given parameters, as an instance of a the given + `array_class` (`FieldArray` if `array_class` is None). + """ + # get the group to load from + if samples_group is None: + samples_group = fp.samples_group + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = fp[samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + return cls._read_fields( + fp, samples_group, loadfields, array_class, + thin_start=thin_start, thin_interval=thin_interval, + thin_end=thin_end, iteration=iteration, temps=temps, + walkers=walkers, flatten=flatten) + + @classmethod + def compute_acfs(cls, fp, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None, + temps=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain for each + temperature. An ACF per-walker will be returned instead if + ``per_walker=True``. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + temps : optional, (list of) int or 'all' + The temperature index (or list of indices) to retrieve. If None + (the default), the ACF will only be computed for the coldest (= 0) + temperature chain. To compute an ACF for all temperates pass 'all', + or a list of all of the temperatures. + + Returns + ------- + FieldArray + A ``FieldArray`` of the ACF vs iteration for each parameter. If + `per-walker` is True, the FieldArray will have shape + ``ntemps x nwalkers x niterations``. Otherwise, the returned + array will have shape ``ntemps x niterations``. 
+ """ + acfs = {} + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + if isinstance(temps, int): + temps = [temps] + elif temps == 'all': + temps = numpy.arange(fp.ntemps) + elif temps is None: + temps = [0] + for param in parameters: + subacfs = [] + for tk in temps: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [cls.compute_acfs(fp, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param, + temps=tk)[param][0, :] + for ii in walkers] + # we'll stack all of the walker arrays to make a single + # nwalkers x niterations array; when these are stacked + # below, we'll get a ntemps x nwalkers x niterations array + subacfs.append(numpy.vstack(arrays)) + else: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, + thin_end=end_index, + walkers=walkers, temps=tk, + flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + thisacf = autocorrelation.calculate_acf(samples).numpy() + subacfs.append(thisacf) + # stack the temperatures + # FIXME: the following if/else can be condensed to a single line + # using numpy.stack, once the version requirements are bumped to + # numpy >= 1.10 + if per_walker: + nw, ni = subacfs[0].shape + acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) + for tk in range(len(temps)): + acfs[param][tk, ...] = subacfs[tk] + else: + acfs[param] = numpy.vstack(subacfs) + return FieldArray.from_kwargs(**acfs) + + @classmethod + def compute_acls(cls, fp, start_index=None, end_index=None): + """Computes the autocorrleation length for all model params and + temperatures in the given file. + + Parameter values are averaged over all walkers at each iteration and + temperature. The ACL is then calculated over the averaged chain. If + the returned ACL is `inf`, will default to the number of current + iterations. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + + Returns + ------- + dict + A dictionary of ntemps-long arrays of the ACLs of each parameter. + """ + acls = {} + if end_index is None: + end_index = fp.niterations + tidx = numpy.arange(fp.ntemps) + for param in fp.variable_params: + these_acls = numpy.zeros(fp.ntemps, dtype=int) + for tk in tidx: + samples = cls.read_samples(fp, param, thin_start=start_index, + thin_interval=1, thin_end=end_index, + temps=tk, flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + these_acls[tk] = acl + acls[param] = these_acls + return acls + + @classmethod + def calculate_logevidence(cls, fp, thin_start=None, thin_end=None, + thin_interval=None): + """Calculates the log evidence from the given file using emcee's + thermodynamic integration. 
+ + Parameters + ---------- + fp : InferenceFile + An open file handler to read the stats from. + thin_start : int + Index of the sample to begin returning stats. Default is to read + stats after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all stats + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + + Returns + ------- + lnZ : float + The estimate of log of the evidence. + dlnZ : float + The error on the estimate. + """ + try: + import emcee + except ImportError: + raise ImportError("emcee is not installed.") + + stats_group = fp.stats_group + parameters = fp[stats_group].keys() + logstats = cls.read_samples(fp, parameters, samples_group=stats_group, + thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval, + temps='all', flatten=False) + # get the likelihoods + logls = logstats['loglr'] + fp.lognl + # we need the betas that were used + betas = fp.attrs['betas'] + # annoyingly, theromdynaimc integration in PTSampler is an instance + # method, so we'll implement a dummy one + ntemps = fp.ntemps + nwalkers = fp.nwalkers + ndim = len(fp.variable_params) + dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None, + None, betas=betas) + return dummy_sampler.thermodynamic_integration_log_evidence( + logls=logls, fburnin=0.) From 5f9c0915b77d720d847c492fd69d8b8a6564353f Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:44:43 +0200 Subject: [PATCH 13/47] add base_mcmc (needs work) --- gwin/sampler/base_mcmc.py | 344 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 gwin/sampler/base_mcmc.py diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py new file mode 100644 index 0000000..69beb75 --- /dev/null +++ b/gwin/sampler/base_mcmc.py @@ -0,0 +1,344 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides constructor classes for MCMC samplers.""" + +from abc import ABCMeta, abstractmethod, abstractproperty + +class BaseMCMC(object): + """This class provides methods common to MCMCs. + + It is not a sampler class itself. Sampler classes can inherit from this + along with ``BaseSampler``. + + Attributes + ---------- + p0 : dict + A dictionary of the initial position of the walkers. Set by using + ``set_p0``. If not set yet, a ``ValueError`` is raised when the + attribute is accessed. + pos : dict + A dictionary of the current walker positions. 
If the sampler hasn't
+        been run yet, returns p0.
+    """
+    __metaclass__ = ABCMeta
+
+    lastclear = None
+    _itercounter = None
+    _pos = None
+    _p0 = None
+    _nwalkers = None
+
+    @abstractproperty
+    def samples_shape(self):
+        """The shape of the samples arrays, excluding the iterations
+        dimension."""
+        pass
+
+    @property
+    def nwalkers(self):
+        """Get the number of walkers."""
+        if self._nwalkers is None:
+            raise ValueError("number of walkers not set")
+        return self._nwalkers
+
+    @property
+    def niterations(self):
+        """Get the current number of iterations."""
+        itercounter = self._itercounter
+        if itercounter is None:
+            itercounter = 0
+        lastclear = self.lastclear
+        if lastclear is None:
+            lastclear = 0
+        return itercounter + lastclear
+
+    @property
+    def pos(self):
+        """A dictionary of the current walker positions; returns ``p0`` if
+        the sampler has not been run yet."""
+        pos = self._pos
+        if pos is None:
+            return self.p0
+        # convert to dict
+        pos = {param: self._pos[..., k]
+               for (k, param) in enumerate(self.sampling_params)}
+        return pos
+
+    @property
+    def p0(self):
+        """The starting position of the walkers in the sampling param space.
+
+        The returned object is a dict mapping the sampling parameters to the
+        values.
+        """
+        if self._p0 is None:
+            raise ValueError("initial positions not set; run set_p0")
+        # convert to dict
+        p0 = {param: self._p0[..., k]
+              for (k, param) in enumerate(self.sampling_params)}
+        return p0
+
+    def set_p0(self, samples_file=None, prior=None):
+        """Sets the initial position of the walkers.
+
+        Parameters
+        ----------
+        samples_file : InferenceFile, optional
+            If provided, use the last iteration in the given file for the
+            starting positions.
+        prior : JointDistribution, optional
+            Use the given prior to set the initial positions rather than
+            ``model``'s prior.
+
+        Returns
+        -------
+        p0 : dict
+            A dictionary mapping sampling params to the starting positions.
+        """
+        # if samples are given then use those as initial positions
+        if samples_file is not None:
+            with self.io(samples_file, 'r') as fp:
+                samples = fp.read_samples(self.variable_params,
+                                          iteration=-1)
+                # make sure we have the same shape
+                assert samples.shape == self.samples_shape, (
+                    "samples in file {} have shape {}, but I have shape "
+                    "{}".format(samples_file, samples.shape,
+                                self.samples_shape))
+            # transform to sampling parameter space
+            samples = self.model.apply_sampling_transforms(samples)
+        # draw random samples if samples are not provided
+        else:
+            nsamples = numpy.prod(self.samples_shape)
+            samples = self.model.prior_rvs(size=nsamples, prior=prior).reshape(
+                self.samples_shape)
+        # store as ND array with shape [samples_shape] x nparams
+        ndim = len(self.variable_params)
+        p0 = numpy.ones(list(self.samples_shape)+[ndim])
+        for i, param in enumerate(self.sampling_params):
+            p0[..., i] = samples[param]
+        self._p0 = p0
+        return self.p0
+
+    @classmethod
+    def n_independent_samples(cls, fp):
+        """Returns the number of independent samples stored in a file.
+
+        The number of independent samples are counted starting from after
+        burn-in. If the sampler hasn't burned in yet, then 0 is returned.
+
+        Parameters
+        -----------
+        fp : InferenceFile
+            An open file handler to read.
+
+        Returns
+        -------
+        int
+            The number of independent samples.
+ """ + # check if burned in + if not fp.is_burned_in: + return 0 + # we'll just read a single parameter from the file + samples = cls.read_samples(fp, fp.variable_params[0]) + return samples.size + + @classmethod + def compute_acfs(cls, fp, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain. An + ACF per-walker will be returned instead if ``per_walker=True``. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + + Returns + ------- + FieldArray + A ``FieldArray`` of the ACF vs iteration for each parameter. If + `per-walker` is True, the FieldArray will have shape + ``nwalkers x niterations``. + """ + acfs = {} + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + for param in parameters: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [cls.compute_acfs(fp, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] + for ii in walkers] + acfs[param] = numpy.vstack(arrays) + else: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, thin_end=end_index, + walkers=walkers, + flatten=False)[param] + samples = samples.mean(axis=0) + acfs[param] = autocorrelation.calculate_acf(samples).numpy() + return FieldArray.from_kwargs(**acfs) + + @classmethod + def compute_acls(cls, fp, start_index=None, end_index=None): + """Computes the autocorrleation length for all model params in the + given file. + + Parameter values are averaged over all walkers at each iteration. + The ACL is then calculated over the averaged chain. If the returned ACL + is `inf`, will default to the number of current iterations. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + + Returns + ------- + dict + A dictionary giving the ACL for each parameter. 
+ """ + acls = {} + for param in fp.variable_params: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, thin_end=end_index, + flatten=False)[param] + samples = samples.mean(axis=0) + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + acls[param] = acl + return acls + + @staticmethod + def write_acls(fp, acls): + """Writes the given autocorrelation lengths to the given file. + + The ACL of each parameter is saved to ``fp['acls/{param}']``. + The maximum over all the parameters is saved to the file's 'acl' + attribute. + + Parameters + ---------- + fp : InferenceFile + An open file handler to write the samples to. + acls : dict + A dictionary of ACLs keyed by the parameter. + + Returns + ------- + ACL + The maximum of the acls that was written to the file. + """ + group = 'acls/{}' + # write the individual acls + for param in acls: + try: + # we need to use the write_direct function because it's + # apparently the only way to update scalars in h5py + fp[group.format(param)].write_direct(numpy.array(acls[param])) + except KeyError: + # dataset doesn't exist yet + fp[group.format(param)] = acls[param] + # write the maximum over all params + fp.attrs['acl'] = numpy.array(acls.values()).max() + return fp.attrs['acl'] + + @staticmethod + def read_acls(fp): + """Reads the acls of all the parameters in the given file. + + Parameters + ---------- + fp : InferenceFile + An open file handler to read the acls from. + + Returns + ------- + dict + A dictionary of the ACLs, keyed by the parameter name. + """ + group = fp['acls'] + return {param: group[param].value for param in group.keys()} + + +class MCMCBurnInSupport(object): + """Provides methods for estimating burn-in.""" + + def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): + """Writes the burn in iterations to the given file. + + Parameters + ---------- + fp : InferenceFile + A file handler to an open inference file. + burn_in_iterations : array + Array of values giving the iteration of the burn in of each walker. + is_burned_in : array + Array of booleans indicating which chains are burned in. + """ + try: + fp['burn_in_iterations'][:] = burn_in_iterations + except KeyError: + fp['burn_in_iterations'] = burn_in_iterations + fp.attrs['burn_in_iterations'] = burn_in_iterations.max() + if is_burned_in is not None: + try: + fp['is_burned_in'][:] = is_burned_in + except KeyError: + fp['is_burned_in'] = is_burned_in + fp.attrs['is_burned_in'] = is_burned_in.all() + From 3d75cab34acce61a9ba1635eee37895e4c7d8431 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 14:26:01 -0400 Subject: [PATCH 14/47] add write_metadata to models --- gwin/io/base_hdf.py | 101 +++++++++++----------------------- gwin/models/base.py | 8 +++ gwin/models/base_data.py | 10 ++++ gwin/models/gaussian_noise.py | 18 ++++++ 4 files changed, 67 insertions(+), 70 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 6bd7864..413ced4 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -229,6 +229,7 @@ def cmd(self): cmd = cmd[-1] return cmd + def write_metadata(self, sampler, **kwargs): """Writes the sampler's metadata. @@ -243,24 +244,12 @@ def write_metadata(self, sampler, **kwargs): key is then stored as a separate attr with its corresponding value. 
""" self.attrs['sampler'] = samlper.name - self.attrs['model'] = sampler.model.name - self.attrs['variable_params'] = list(sampler.variable_params) - self.attrs['sampling_params'] = list(sampler.sampling_params) + # write the model's metadata + sampler.model.write_metadata(self) + write_kwargs_to_hdf_attrs(self.attrs, **kwargs) # FIXME: what will write this? #fp.attrs["lognl"] = self.model.lognl # add the static params to the kwargs - kwargs['static_params'] = sampler.static_params - for arg, val in kwargs.items(): - if val is None: - val = str(None) - if isinstance(val, dict): - self.attrs[arg] = val.keys() - for key, item in val.items(): - if item is None: - item = str(None) - self.attrs[key] = item - else: - self.attrs[arg] = val def write_logevidence(self, lnz, dlnz): """Writes the given log evidence and its error. @@ -336,11 +325,6 @@ def read_random_state(self, group=None): cached_gauss = self[dataset_name].attrs["cached_gauss"] return s, arr, pos, has_gauss, cached_gauss - def load_random_state(self): - """Sets numpy's random state using what is saved in the file. - """ - numpy.random.set_state(self.read_random_state()) - def write_strain(self, strain_dict, group=None): """Writes strain for each IFO to file. @@ -384,73 +368,25 @@ def write_stilde(self, stilde_dict, group=None): self[group.format(ifo=ifo)].attrs['delta_f'] = stilde.delta_f self[group.format(ifo=ifo)].attrs['epoch'] = float(stilde.epoch) - def write_psd(self, psds, low_frequency_cutoff, group=None): + def write_psd(self, psds, group=None): """Writes PSD for each IFO to file. Parameters ----------- psds : {dict, FrequencySeries} A dict of FrequencySeries where the key is the IFO. - low_frequency_cutoff : {dict, float} - A dict of the low-frequency cutoff where the key is the IFO. The - minimum value will be stored as an attr in the File. group : {None, str} - The group to write the strain to. If None, will write to the top - level. + The group to write the psd to. Default is ``data_group``. """ subgroup = self.data_group + "/{ifo}/psds/0" if group is None: group = subgroup else: group = '/'.join([group, subgroup]) - self.attrs["low_frequency_cutoff"] = min(low_frequency_cutoff.values()) for ifo in psds: self[group.format(ifo=ifo)] = psds[ifo] self[group.format(ifo=ifo)].attrs['delta_f'] = psds[ifo].delta_f - def write_data(self, strain_dict=None, stilde_dict=None, - psd_dict=None, low_frequency_cutoff_dict=None, - group=None): - """Writes the strain/stilde/psd. - - Parameters - ---------- - strain_dict : {None, dict} - A dictionary of strains. If None, no strain will be written. - stilde_dict : {None, dict} - A dictionary of stilde. If None, no stilde will be written. - psd_dict : {None, dict} - A dictionary of psds. If None, no psds will be written. - low_freuency_cutoff_dict : {None, dict} - A dictionary of low frequency cutoffs used for each detector in - `psd_dict`; must be provided if `psd_dict` is not None. - group : {None, str} - The group to write the strain to. If None, will write to the top - level. 
- """ - # save PSD - if psd_dict is not None: - if low_frequency_cutoff_dict is None: - raise ValueError("must provide low_frequency_cutoff_dict if " - "saving psds to output") - # apply dynamic range factor for saving PSDs since - # plotting code expects it - psd_dyn_dict = {} - for key, val in psd_dict.iteritems(): - psd_dyn_dict[key] = FrequencySeries(val*DYN_RANGE_FAC**2, - delta_f=val.delta_f) - self.write_psd(psds=psd_dyn_dict, - low_frequency_cutoff=low_frequency_cutoff_dict, - group=group) - - # save stilde - if stilde_dict is not None: - self.write_stilde(stilde_dict, group=group) - - # save strain if desired - if strain_dict is not None: - self.write_strain(strain_dict, group=group) - def write_injections(self, injection_file): """Writes injection parameters from the given injection file. @@ -668,6 +604,31 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, #return other +def write_kwargs_to_hdf_attrs(attrs, **kwargs): + """Writes the given keywords to the given ``attrs``. + + If any keyword argument points to a dict, the keyword will point to a + list of the dict's keys. Each key is then written to the attrs with its + corresponding value. + + Parameters + ---------- + attrs : an HDF attrs + Can be either the ``attrs`` of the hdf file, or any group in a file. + \**kwargs : + The keywords to write. + """ + for arg, val in kwargs.items(): + if val is None: + val = str(None) + if isinstance(val, dict): + attrs[arg] = val.keys() + # just call self again with the dict as kwargs + write_kwargs_to_hdf_attrs(attrs, **val) + else: + attrs[arg] = val + + def check_integrity(filename): """Checks the integrity of an InferenceFile. diff --git a/gwin/models/base.py b/gwin/models/base.py index f4c4378..d5a3d5e 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -34,6 +34,7 @@ from pycbc.io import FieldArray from pycbc.workflow import ConfigParser +from gwin.io.base_hdf import write_kwargs_to_hdf_attrs # # ============================================================================= @@ -742,3 +743,10 @@ def from_config(cls, cp, **kwargs): args['sampling_transforms'] = sampling_transforms args.update(kwargs) return cls(**args) + + def write_metadata(self, fp): + """Writes metadata to the given file handler.""" + fp.attrs['model'] = sampler.model.name + fp.attrs['variable_params'] = list(self.variable_params) + fp.attrs['sampling_params'] = list(self.sampling_params) + write_kwargs_to_hdf_attrs(fp.attrs, static_params=self.static_params) diff --git a/gwin/models/base_data.py b/gwin/models/base_data.py index 0c2095e..b15327f 100644 --- a/gwin/models/base_data.py +++ b/gwin/models/base_data.py @@ -150,6 +150,11 @@ def data(self): """Returns the data that was set.""" return self._data + @property + def detectors(self): + """Returns the detectors used.""" + return self._data.keys() + def _transform_params(self, **params): """Adds waveform transforms to parent's ``_transform_params``.""" params = super(BaseDataModel, self)._transform_params(**params) @@ -231,3 +236,8 @@ def from_config(cls, cp, data, delta_f=None, delta_t=None, args['waveform_generator'] = waveform_generator return cls(**args) + + def write_metadata(self, fp): + """Adds data to the metadata that's written.""" + super(BaseDataModel, self).write_metadata(fp) + fp.write_stilde(self.data) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index a2279de..c04dd4c 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -244,6 +244,7 @@ def __init__(self, 
variable_params, data, waveform_generator,
         d = data.values()[0]
         N = len(d)
         # figure out the kmin, kmax to use
+        self._f_lower = f_lower
         kmin, kmax = filter.get_cutoff_indices(f_lower, f_upper,
                                                d.delta_f, (N-1)*2)
         self._kmin = kmin
@@ -252,9 +253,12 @@ def __init__(self, variable_params, data, waveform_generator,
         norm = 4*d.delta_f
         # we'll store the weight to apply to the inner product
         if psds is None:
+            self._psds = None
             w = Array(numpy.sqrt(norm)*numpy.ones(N))
             self._weight = {det: w for det in data}
         else:
+            # store a copy of the psds
+            self._psds = {ifo: d.copy() for (ifo, d) in psds.items()}
             # temporarily suppress numpy divide by 0 warning
             numpysettings = numpy.seterr(divide='ignore')
             self._weight = {det: Array(numpy.sqrt(norm/psds[det]))
@@ -432,3 +436,17 @@ def det_optimal_snrsq(self, det):
             self.loglr
         # now try returning again
         return getattr(self._current_stats, '{}_optimal_snrsq'.format(det))
+
+    def write_metadata(self, fp):
+        """Adds writing the psds and the lognl, since it is a constant.
+
+        The lognl is written to the samples group's ``attrs``.
+        """
+        super(GaussianNoise, self).write_metadata(fp)
+        fp.attrs['f_lower'] = self._f_lower
+        if self._psds is not None:
+            fp.write_psd(self._psds)
+        attrs = fp[fp.samples_group].attrs
+        attrs['lognl'] = self.lognl
+        for det in self.detectors:
+            attrs['{}_lognl'.format(det)] = self.det_lognl(det)

From f81edab1911c8c1dcb6066f4329b423e79a202a2 Mon Sep 17 00:00:00 2001
From: Collin Capano
Date: Mon, 16 Jul 2018 18:15:03 -0400
Subject: [PATCH 15/47] move setting up checkpoint and run interval to sampler
 methods

---
 bin/gwin                  | 306 ++++++++------------------------------
 gwin/io/base_hdf.py       |   4 -
 gwin/sampler/base.py      | 122 ++++++++++++++-
 gwin/sampler/base_mcmc.py | 250 +++++++++++++++++++++++++++----
 4 files changed, 403 insertions(+), 279 deletions(-)

diff --git a/bin/gwin b/bin/gwin
index 2d0439b..6cf268d 100644
--- a/bin/gwin
+++ b/bin/gwin
@@ -39,11 +39,39 @@ from gwin.calibration import Recalibrate
 # command line usage
 parser = argparse.ArgumentParser(usage=__file__ + " [--options]",
                                  description=__doc__)
-
-# version option
 parser.add_argument("--version", action="version", version=__version__,
                     help="Prints version information.")
-
+parser.add_argument("--verbose", action="store_true", default=False,
+                    help="Print logging messages.")
+# output options
+parser.add_argument("--output-file", type=str, required=True,
+                    help="Output file path.")
+parser.add_argument("--force", action="store_true", default=False,
+                    help="If the output-file already exists, overwrite it. "
+                         "Otherwise, an OSError is raised.")
+parser.add_argument("--save-backup", action="store_true",
+                    default=False,
+                    help="Don't delete the backup file after the run has "
+                         "completed.")
+# run duration options
+parser.add_argument("--nsamples", type=int, required=True,
+                    help="The number of samples the sampler should get. "
+                         "The sampler will run until it has acquired at least "
+                         "this many samples. Depending on checkpoint settings "
+                         "it may go over.")
+parser.add_argument("--require-indep-samples", action="store_true",
+                    default=False,
+                    help="Require that the number of samples set by nsamples "
+                         "be independent. If this is not set, MCMC samplers "
+                         "will just run until they have the desired number of "
+                         "raw samples (with no thinning).")
+parser.add_argument("--samples-file", default=None,
+                    help="Use an iteration from an InferenceFile as the "
+                         "initial proposal distribution. 
The same " + "number of walkers and the same [variable_params] " + "section in the configuration file should be used. " + "The priors must allow encompass the initial " + "positions from the InferenceFile being read.") # add data options parser.add_argument("--instruments", type=str, nargs="+", help="IFOs, eg. H1 L1.") @@ -57,57 +85,8 @@ parser.add_argument("--psd-end-time", type=float, default=None, parser.add_argument("--seed", type=int, default=0, help="Seed to use for the random number generator that " "initially distributes the walkers. Default is 0.") -parser.add_argument("--samples-file", default=None, - help="Use an iteration from an InferenceFile as the " - "initial proposal distribution. The same " - "number of walkers and the same [variable_params] " - "section in the configuration file should be used. " - "The priors must allow encompass the initial " - "positions from the InferenceFile being read.") - -# add sampler options -option_utils.add_sampler_option_group(parser) - # add config options option_utils.add_config_opts_to_parser(parser) - -# output options -parser.add_argument("--output-file", type=str, required=True, - help="Output file path.") -parser.add_argument("--force", action="store_true", default=False, - help="If the output-file already exists, overwrite it. " - "Otherwise, an OSError is raised.") -parser.add_argument("--save-strain", action="store_true", default=False, - help="Save the conditioned strain time series to the " - "output file. If gate-overwhitened, this is done " - "before all gates have been applied.") -parser.add_argument("--save-stilde", action="store_true", default=False, - help="Save the conditioned strain frequency series to " - "the output file. This is done after all gates have " - "been applied.") -parser.add_argument("--save-psd", action="store_true", default=False, - help="Save the psd of each ifo to the output file.") -parser.add_argument("--checkpoint-interval", type=int, default=None, - help="Number of iterations to take before saving new " - "samples to file, calculating ACL, and updating " - "burn-in estimate.") -parser.add_argument("--resume-from-checkpoint", action="store_true", - default=False, - help="Automatically load results from checkpoint/backup " - "file.") -parser.add_argument("--save-backup", action="store_true", - default=False, - help="Don't delete the backup file after the run has " - "completed.") -parser.add_argument("--checkpoint-fast", action="store_true", - help="Do not calculate ACL after each checkpoint, only at " - "the end. 
Not applicable if n-independent-samples " - "have been specified.") - -# verbose option -parser.add_argument("--verbose", action="store_true", default=False, - help="Print logging messages.") - # add module pre-defined options fft.insert_fft_option_group(parser) opt.insert_optimization_option_group(parser) @@ -131,41 +110,6 @@ scheme.verify_processing_options(opts, parser) #strain.verify_strain_options(opts, parser) weave.verify_weave_options(opts, parser) -# check for the output file -if os.path.exists(opts.output_file) and not opts.force: - raise OSError("output-file already exists; use --force if you wish to " - "overwrite it.") - -# check for backup file(s) -checkpoint_file = opts.output_file + '.checkpoint' -backup_file = opts.output_file + '.bkup' -checkpoint_valid = validate_checkpoint_files(checkpoint_file, backup_file) - -# determine what to do with checkpoints -if checkpoint_valid and not opts.resume_from_checkpoint and not opts.force: - raise OSError("valid checkpoint file {} found, but " - "resume-from-checkpoint not on. If you wish to overwrite " - "use --force; otherwise, use --resume-from-checkpoint") -if not opts.resume_from_checkpoint and opts.force: - checkpoint_valid = False - -# check for how many iterations to run -max_iterations = opts.niterations -if opts.niterations is not None and opts.n_independent_samples is not None: - raise ValueError("Must specify either niterations or n-independent-" - "samples, not both") -elif opts.niterations is not None: - get_nsamples = opts.niterations -elif opts.n_independent_samples is not None: - if opts.checkpoint_interval is None: - raise ValueError("n-independent-samples requires a checkpoint-" - "interval; see help") - get_nsamples = opts.n_independent_samples -else: - raise ValueError("Must specify niterations or n-independent-samples; " - "see --help") - - # set seed numpy.random.seed(opts.seed) logging.info("Using seed %i", opts.seed) @@ -218,41 +162,22 @@ with ctx: logging.info("Setting up sampler") - # create sampler that will run - sampler = option_utils.sampler_from_cli(opts, model) - - # save information about this data and settings - if not checkpoint_valid: - with InferenceFile(checkpoint_file, "w") as fp: - # save command line and data - logging.info("Creating and writing data to output file") - fp.write_data( - strain_dict=strain_dict if opts.save_strain else None, - stilde_dict=stilde_dict if opts.save_stilde else None, - psd_dict=psd_dict if opts.save_psd else None, - low_frequency_cutoff_dict=low_frequency_cutoff_dict) - - # save injection parameters - if opts.injection_file: - for ifo in opts.instruments: - logging.info("Writing %s injections to output file", ifo) - if ifo in opts.injection_file.keys(): - inj_file = opts.injection_file[ifo] - elif len(opts.injection_file) == 1: - inj_file = opts.injection_file.values()[0] - else: - logging.warn("Could not find injections for %s", ifo) - continue - fp.write_injections(opts.injection_file.values()[0], ifo) - # copy to backup - shutil.copy(checkpoint_file, backup_file) - - # write the command line, resume point - for fn in [checkpoint_file, backup_file]: - with InferenceFile(fn, "a") as fp: - fp.write_command_line() - if checkpoint_valid: - fp.write_resume_point() + # Create sampler that will run. + # Note: the pool is created at this point. This means that, + # unless you enjoy angering your cluster admins, + # NO SAMPLES FILE IO SHOULD BE DONE PRIOR TO THIS POINT!!! 
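+    # As a rough sketch of what that call involves (the exact option names
+    # are set by each sampler's from_config method, so treat the following
+    # as illustrative only): load_from_config looks up the sampler named in
+    # the config file and hands it the config parser, the model, and the
+    # pool settings.  A configuration selecting the parallel-tempered emcee
+    # sampler might contain something like
+    #
+    #   [sampler]
+    #   name = emcee_pt
+    #   ntemps = 20
+    #   nwalkers = 200
+    #
+    # which would build the EmceePTSampler defined in
+    # gwin/sampler/emcee_pt.py.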
+ sampler = gwin.sampler.load_from_config( + cp, model, nprocesses=opts.nprocesses, use_mpi=opts.use_mpi) + + # set up output/checkpoint file + # Note: PyCBC's multi-ifo parser uses key:ifo for + # the injection file, even though we will use the same + # injection file all detectors. This + # should be fixed in a future version of PyCBC. Once it is, + # update this. Until then, just use the first file. + injection_file = opts.injection_file.values()[0] # None if not set + sampler.setup_output(opts.output_file, force=opts.force, + injection_file=injetion_file) # set the walkers initial positions from a pre-existing InferenceFile # or a specific initial distribution listed in the configuration file @@ -260,12 +185,12 @@ with ctx: logging.info("Setting walkers initial conditions for varying parameters") samples_file = opts.samples_file # use the checkpoint file instead if resume from checkpoint - if opts.resume_from_checkpoint and checkpoint_valid: - samples_file = checkpoint_file + if sampler.checkpoint_valid: + samples_file = sampler.checkpoint_file if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", samples_file) - samples_file = InferenceFile(samples_file, "r") + samples_file = sampler.io(samples_file, "r") init_prior = None elif len(cp.get_subsections("initial")): initial_dists = distributions.read_distributions_from_config( @@ -276,127 +201,18 @@ with ctx: *initial_dists, **{"constraints" : constraints}) else: init_prior = None - sampler.set_p0(samples_file=samples_file, prior=init_prior) - # if getting samples from file then put sampler and random number generator - # back in its former state - if samples_file is not None: - sampler.set_state_from_file(samples_file) - samples_file.close() - - # run sampler's burn in if it is in the list of burn in functions - if "use_sampler" in burn_in_eval.burn_in_functions: - # remove the sampler's burn in so we don't run more than once - burn_in_eval.burn_in_functions.pop("use_sampler") - # we'll only do this if we don't have a valid checkpoint: since the - # checkpoint happens after the sampler's burn in, the sampler's burn in - # must have already run if we have a valid checkpoint file - if not checkpoint_valid: - with InferenceFile(checkpoint_file, "a") as fp: - logging.info("Running sampler's burn in function") - burnidx, is_burned_in = burn_in.use_sampler(sampler, fp) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - # write the burn in results - logging.info("Writing burn in samples to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - # write to backup file - with InferenceFile(backup_file, "a") as fp: - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - - - # get the starting number of samples: - # nsamples keeps track of the number of samples we've obtained (if - # --n-independent-samples is used, this is the number of independent - # samples; otherwise, this is the number of iterations); - # start is the number of iterations that the file already contains (either - # due to sampler burn-in, or a previous checkpoint) - try: - with InferenceFile(checkpoint_file, "r") as fp: - start = fp.niterations - except KeyError: - start = 0 - if opts.n_independent_samples is not None: - try: - with InferenceFile(checkpoint_file, "r") as fp: - nsamples = fp.n_independent_samples - except AttributeError: - nsamples = start - else: - nsamples = start - # to 
ensure iterations are counted properly, he sampler's lastclear should - # be the same as start - sampler.lastclear = start - - interval = opts.checkpoint_interval - if interval is None: - interval = get_nsamples - - # run sampler until we have the desired number of samples - while nsamples < get_nsamples: - - end = start + interval - - # adjust the interval if we would go past the number of iterations - if opts.n_independent_samples is None and end > get_nsamples: - interval = get_nsamples - start - end = start + interval - - # run sampler and set initial values to None so that sampler - # picks up from where it left off next call - logging.info("Running sampler for {} to {} iterations".format(start, - end)) - sampler.run(interval) - - # write new samples - with InferenceFile(checkpoint_file, "a") as fp: - - logging.info("Writing results to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - logging.info("Updating burn in") - burnidx, is_burned_in = burn_in_eval.update(sampler, fp) - - # compute the acls and write - acls = None - if opts.n_independent_samples is not None or end >= get_nsamples \ - or not opts.checkpoint_fast: - logging.info("Computing acls") - acls = sampler.compute_acls(fp) - sampler.write_acls(fp, acls) - - # write to backup - with InferenceFile(backup_file, "a") as fp: - - logging.info("Writing to backup file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: - sampler.write_acls(fp, acls) - - # check validity - checkpoint_valid = validate_checkpoint_files(checkpoint_file, - backup_file) - if not checkpoint_valid: - raise IOError("error writing to checkpoint file") - - # update nsamples for next loop - if opts.n_independent_samples is not None: - with InferenceFile(checkpoint_file, 'r') as fp: - nsamples = fp.n_independent_samples - logging.info("Have {} independent samples".format(nsamples)) - else: - nsamples += interval - - - # clear the in-memory chain to save memory - logging.info("Clearing chain") - sampler.clear_chain() - - start = end + sampler.set_initial_conditions(intial_distribution=init_prior, + samples_file=samples_file) + + # Set the target number of samples for the sampler + sampler.set_target(opts.nsamples, opts.require_indep_samples) + + # Run the sampler + sampler.run() + + # finalize and exit + sampler.finalize() # compute evidence, if supported with InferenceFile(checkpoint_file, 'a') as fp: diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 413ced4..2601a93 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -229,7 +229,6 @@ def cmd(self): cmd = cmd[-1] return cmd - def write_metadata(self, sampler, **kwargs): """Writes the sampler's metadata. @@ -247,9 +246,6 @@ def write_metadata(self, sampler, **kwargs): # write the model's metadata sampler.model.write_metadata(self) write_kwargs_to_hdf_attrs(self.attrs, **kwargs) - # FIXME: what will write this? - #fp.attrs["lognl"] = self.model.lognl - # add the static params to the kwargs def write_logevidence(self, lnz, dlnz): """Writes the given log evidence and its error. 
diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 38b5029..12dcc24 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -57,7 +57,8 @@ def __init__(self, model): #@classmethod # uncomment when we move to python 3.3 @abstractmethod - def from_config(cls, cp, model, pool=None, model_call=None, **kwargs): + def from_config(cls, cp, model, nprocesses=1, use_mpi=False, + **kwargs): """This should initialize the sampler given a config file. """ pass @@ -81,9 +82,9 @@ def static_params(self): return self.model.static_params @abstractproperty - def raw_samples(self): - """A dict mapping sampling_params to arrays of samples currently - in memory. + def samples(self): + """A dict mapping variable_params to arrays of samples currently + in memory. The dictionary may also contain sampling_params. The sample arrays may have any shape, and may or may not be thinned. """ @@ -116,9 +117,122 @@ def io(self): """ pass + @abstractmethod + def set_initial_conditions(self, initial_distribution=None, + samples_file=None): + """Sets up the starting point for the sampler. + + Should also set the sampler's random state. + """ + pass + @abstractmethod def checkpoint(self): """The sampler must have a checkpoint method for dumping raw samples and stats to the file type defined by ``io``. """ pass + + def setup_output(self, output_file, force=False, injection_file=None): + """Sets up the sampler's checkpoint and output files. + + The checkpoint file has the same name as the output file, but with + ``.checkpoint`` appended to the name. A backup file will also be + created. + + If the output file already exists, an ``OSError`` will be raised. + This can be overridden by setting ``force`` to ``True``. + + Parameters + ---------- + sampler : sampler instance + Sampler + output_file : str + Name of the output file. + force : bool, optional + If the output file already exists, overwrite it. + injection_file : str, optional + If an injection was added to the data, write its information. + """ + # check for backup file(s) + checkpoint_file = output_file + '.checkpoint' + backup_file = output_file + '.bkup' + # check if we have a good checkpoint and/or backup file + checkpoint_valid = validate_checkpoint_files(checkpoint_file, + backup_file) + # Create a new file if the checkpoint doesn't exist, or if it is + # corrupted + if not checkpoint_valid: + self.create_new_output_file(checkpoint_file, force=force, + injection_file=injection_file) + # now the checkpoint is valid + checkpoint_valid = True + # copy to backup + shutil.copy(checkpoint_file, backup_file) + # write the command line + for fn in [checkpoint_file, backup_file]: + with sampler.io(fn, "a") as fp: + fp.write_command_line() + # store + self.checkpoint_file = checkpoint_file + self.backup_file = backup_file + self.checkpoint_valid = checkpoint_valid + + def set_target(self, nsamples, require_independent=False): + """Sets the number of samples the sampler should try to acquire. + + If the ``must_be_independent`` flag is set, then the number of samples + must be independent. This means, for example, that MCMC chains are + thinned by their ACL before counting samples. Otherwise, the sampler + will just run until it has the requested number of samples, regardless + of thinning. + + Parameters + ---------- + nsamples : int + The number of samples to acquire. + must_be_independent : bool, optional + Add the requirement that the target number of samples be + independent. Default is False. 
+ """ + self.target_nsamples = nsamples + self.require_indep_samples = require_independent + + + +def create_new_output_file(sampler, filename, force=False, injection_file=None, + **kwargs): + """Creates a new output file. + + If the output file already exists, an ``OSError`` will be raised. This can + be overridden by setting ``force`` to ``True``. + + Parameters + ---------- + sampler : sampler instance + Sampler + filename : str + Name of the file to create. + force : bool, optional + Create the file even if it already exists. Default is False. + injection_file : str, optional + If an injection was added to the data, write its information. + \**kwargs : + All other keyword arguments are passed through to the file's + ``write_metadata`` function. + """ + if os.path.exists(filename): + if force: + os.remove(filename) + else: + raise OSError("output-file already exists; use force if you " + "wish to overwrite it.") + logging.info("Creating file {}".format(filename)) + with sampler.io(filename, "w") as fp: + # save the sampler's metadata + fp.write_metadata(sampler) + # save injection parameters + if injection_file is not None: + logging.info("Writing injection file to output") + # just use the first one + fp.write_injections(injection_file) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 69beb75..65d4e86 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -21,10 +21,89 @@ # # ============================================================================= # -"""Provides constructor classes for MCMC samplers.""" +"""Provides constructor classes and convenience functions for MCMC samplers.""" from abc import ABCMeta, abstractmethod, abstractproperty +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# +def raw_samples_to_dict(sampler, raw_samples): + """Convenience function for converting ND array to a dict of samples. + + The samples are assumed to have dimension + ``[sampler.base_shape x] niterations x len(sampler.sampling_params)``. + + Parameters + ---------- + sampler : sampler instance + An instance of an MCMC sampler. + raw_samples : array + The array of samples to convert. + + Returns + ------- + dict : + A dictionary mapping the raw samples to the variable params. If the + sampling params are not the same as the variable params, they will + also be included. Each array will have shape + ``[sampler.base_shape x] niterations``. + """ + sampling_params = sampler.sampling_params + # convert to dictionary + samples = {param: raw_samples[..., ii] for + ii, param in enumerate(sampling_params)} + # apply boundary conditions + samples = sampler.model.prior_distribution.apply_boundary_conditions( + **samples) + # apply transforms to go to model's variable params space + return sampler.model.sampling_transforms.apply(samples, inverse=True) + + +def raw_stats_to_dict(sampler, raw_stats): + """Converts an ND array of model stats to a dict. + + The ``raw_stats`` may either be a numpy array or a list. If the + former, the stats are assumed to have shape + ``[sampler.base_shape x] niterations x nstats, where nstats are the number + of stats returned by ``sampler.model.default_stats``. If the latter, the + list is cast to an array that is assumed to be the same shape as if an + array was given. + + Parameters + ---------- + sampler : sampler instance + An instance of an MCMC sampler. 
+ raw_stats : array or list + The stats to convert. + + Returns + ------- + dict : + A dictionary mapping the model's ``default_stats`` to arrays of values. + Each array will have shape ``[sampler.base_shape x] niterations``. + """ + if not isinstance(raw_stats, numpy.ndarray): + # Assume list. Since the model returns a tuple of values, this should + # be a [sampler.base_shape x] x niterations list of tuples. We can + # therefore immediately convert this to a ND array. + raw_stats = numpy.array(raw_stats) + return {stat: raw_stats[..., ii] + for (ii, stat) in enumerate(self.model.default_stats)} + +# +# ============================================================================= +# +# BaseMCMC definition +# +# ============================================================================= +# + + class BaseMCMC(object): """This class provides methods common to MCMCs. @@ -43,15 +122,21 @@ class BaseMCMC(object): """ __metaclass__ = ABCMeta - lastclear = None + _lastclear = None _itercounter = None _pos = None _p0 = None _nwalkers = None @abstractproperty(self): - def samples_shape(self): - """Should define what shape to expect samples to be in.""" + def base_shape(self): + """What shape the sampler's samples arrays are in, excluding + the iterations dimension. + + For example, if a sampler uses 20 walkers and 3 temperatures, this + would be ``(3, 20)``. If a sampler only uses a single walker and no + temperatures this would be ``()``. + """ pass @property @@ -67,7 +152,7 @@ def niterations(self): itercounter = self._itercounter if _itercounter is None: itercounter = 0 - lastclear = self.lastclear + lastclear = self._lastclear if lastclear is None: lastclear = 0 return itercounter + lastclear @@ -119,11 +204,11 @@ def set_p0(self, samples_file=None, prior=None): samples = fp.read_samples(self.variable_params, iteration=-1) # make sure we have the same shape - assert(samples.shape == self.samples_shape, + assert(samples.shape[:-1] == self.samples_shape, "samples in file {} have shape {}, but I have shape {}". format(samples_file, samples.shape, self.samples_shape)) # transform to sampling parameter space - samples = self.model.apply_sampling_transforms(samples) + samples = self.model.sampling_transforms.apply(samples) # draw random samples if samples are not provided else: nsamples = numpy.prod(self.samples_shape) @@ -137,29 +222,142 @@ def set_p0(self, samples_file=None, prior=None): self._p0 = p0 return self.p0 - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in a file. + def set_initial_conditions(self, initial_distribution=None, + samples_file=None): + """Sets the initial starting point for the MCMC. - The number of independent samples are counted starting from after - burn-in. If the sampler hasn't burned in yet, then 0 is returned. + If a starting samples file is provided, will also load the random + state from it. + """ + self.set_p0(samples_file=samples_file, prior=initial_distribution) + # if a samples file was provided, use it to set the state of the + # sampler + if samples_file is not None: + self.set_state_from_file(samples_file) - Parameters - ----------- - fp : InferenceFile - An open file handler to read. + @abstractmethod + def set_state_from_file(self, filename): + """Sets the state of the sampler to the instance saved in a file. + """ + pass - Returns - ------- - int - The number of independent samples. + @abstractmethod + def write_state(self, filename): + """Saves the state of the sampler to the given file. 
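To make the shape conventions above concrete, the array-to-dict step that ``raw_samples_to_dict`` performs can be sketched standalone (made-up shapes and parameter names; the real function additionally applies boundary conditions and the inverse sampling transforms):

    import numpy

    nwalkers, niterations = 4, 100        # base_shape is (nwalkers,) for an ensemble MCMC
    sampling_params = ['mass1', 'mass2']  # stand-in for sampler.sampling_params
    raw_samples = numpy.random.normal(
        size=(nwalkers, niterations, len(sampling_params)))
    samples = {param: raw_samples[..., ii]
               for ii, param in enumerate(sampling_params)}
    assert samples['mass1'].shape == (nwalkers, niterations)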
""" - # check if burned in - if not fp.is_burned_in: - return 0 - # we'll just read a single parameter from the file - samples = cls.read_samples(fp, fp.variable_params[0]) - return samples.size + pass + + def run(self): + """Runs the sampler.""" + + if self.require_indep_samples and self.checkpoint_interval is None: + raise ValueError("A checkpoint interval must be set if " + "independent samples are required") + # figure out how many iterations I need to run for: this is the target + # number of samples / the number of walkers + target_niters = self.target_nsamples / self.nwalkers + + # get the starting number of samples: + # "nsamples" keeps track of the number of samples we've obtained (if + # require_indep_samples is used, this is the number of independent + # samples; otherwise, this is the total number of samples). + # "startiter" is the number of iterations that the file already contains + # (either due to sampler burn-in, or a previous checkpoint) + try: + with self.io(self.checkpoint_file, "r") as fp: + start = fp.niterations + except KeyError: + startiter = 0 + if self.require_indep_samples: + with self.io(self.checkpoint_file, "r") as fp: + nsamples = fp.n_independent_samples + else: + # the number of samples is the number of iterations times the + # number of walkers + nsamples = startiter * self.nwalkers + + # to ensure iterations are counted properly, the sampler's lastclear + # should be the same as start + self._lastclear = startiter + + iterinterval = self.checkpoint_interval + if iterinterval is None: + iterinterval = int(numpy.ceil( + float(self.target_nsamples) / self.nwalkers)) + + # run sampler until we have the desired number of samples + while nsamples < self.target_nsamples: + + enditer = startiter + iterinterval + + # adjust the interval if we would go past the number of iterations + endnsamp = enditer * self.nwalkers + if endnsamp > self.target_nsamples \ + and not self.require_indep_samples: + iterinterval = int(numpy.ceil( + (endnsamp - self.target_nsamples) / self.nwalkers)) + + # run sampler and set initial values to None so that sampler + # picks up from where it left off next call + logging.info("Running sampler for {} to {} iterations".format( + startiter, enditer)) + self.run_mcmc(iterinterval) + + # update nsamples for next loop + if opts.n_independent_samples is not None: + with InferenceFile(checkpoint_file, 'r') as fp: + nsamples = fp.n_independent_samples + logging.info("Have {} independent samples".format(nsamples)) + else: + nsamples += interval + + + # clear the in-memory chain to save memory + logging.info("Clearing chain") + sampler.clear_chain() + + start = end + + @abstractmethod + def run_for_niterations(self, niterations): + """Run the MCMC for the given number of iterations.""" + pass + + def checkpoint(self): + """Dumps current samples to the checkpoint file.""" + # write new samples + with self.io(checkpoint_file, "a") as fp: + + logging.info("Writing samples to file") + sampler.write_results(fp, static_params=model.static_params, + ifos=opts.instruments) + logging.info("Updating burn in") + burnidx, is_burned_in = burn_in_eval.update(sampler, fp) + + # compute the acls and write + acls = None + if opts.n_independent_samples is not None or end >= get_nsamples \ + or not opts.checkpoint_fast: + logging.info("Computing acls") + acls = sampler.compute_acls(fp) + sampler.write_acls(fp, acls) + + # write to backup + with InferenceFile(backup_file, "a") as fp: + + logging.info("Writing to backup file") + sampler.write_results(fp, 
static_params=model.static_params, + ifos=opts.instruments) + sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + if acls is not None: + sampler.write_acls(fp, acls) + + # check validity + checkpoint_valid = validate_checkpoint_files(checkpoint_file, + backup_file) + if not checkpoint_valid: + raise IOError("error writing to checkpoint file") + @classmethod def compute_acfs(cls, fp, start_index=None, end_index=None, From 2f9a2b2c3879ebbb6f6bfa0236057ee5fd8b5abb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 19:56:08 -0400 Subject: [PATCH 16/47] rearrange read/write functions; add checkpoint and finalize methods; add run method to base_mcmc --- gwin/io/base_hdf.py | 22 +--- gwin/io/base_mcmc.py | 18 +-- gwin/io/emcee.py | 10 +- gwin/sampler/base.py | 59 +++++++++- gwin/sampler/base_mcmc.py | 67 +++++------ gwin/sampler/emcee.py | 241 ++++++++++++++++---------------------- 6 files changed, 199 insertions(+), 218 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 2601a93..39fd96a 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -204,11 +204,11 @@ def static_params(self): return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} @property - def n_independent_samples(self): + def n_indep_samples(self): """Returns the number of independent samples stored in the file. """ try: - return self.attrs['n_independent_samples'] + return self.attrs['n_indep_samples'] except KeyError: return 0 @@ -229,24 +229,6 @@ def cmd(self): cmd = cmd[-1] return cmd - def write_metadata(self, sampler, **kwargs): - """Writes the sampler's metadata. - - Parameters - ---------- - sampler : gwin.sampler - An instance of a gwin sampler. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - self.attrs['sampler'] = samlper.name - # write the model's metadata - sampler.model.write_metadata(self) - write_kwargs_to_hdf_attrs(self.attrs, **kwargs) - def write_logevidence(self, lnz, dlnz): """Writes the given log evidence and its error. diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index a597c6d..302aed3 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -51,18 +51,7 @@ def read_acls(self): """ pass - def write_mcmc_metadata(self, sampler): - """Writes metadata unique to an ensemble MCMC. - - Parameters - ---------- - sampler : gwin.sampler - An instance of a gwin sampler. - """ - self.attrs["niterations"] = sampler.niterations - self.attrs["nwalkers"] = sampler.nwalkers - - def write_samples(self, parameters, samples, + def write_samples(self, samples, parameters=None, start_iteration=None, max_iterations=None): """Writes samples to the given file. @@ -75,11 +64,12 @@ def write_samples(self, parameters, samples, Parameters ----------- - parameters : list - The parameters to write to the file. samples : dict The samples to write. Each array in the dictionary should have shape nwalkers x niterations. + parameters : list, optional + Only write the specified parameters to the file. If None, will + write all of the keys in the ``samples`` dict. start_iteration : int, optional Write results to the file's datasets starting at the given iteration. 
Default is to append after the last iteration in the diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index c127da5..2376c64 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -33,12 +33,10 @@ class EmceeFile(EnsembleMCMCIO, BaseInferenceFile): name = 'emcee_file' def read_acceptance_fraction(self, walkers=None): - """Reads the acceptance fraction from the given file. + """Reads the acceptance fraction. Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. walkers : {None, (list of) int} The walker index (or a list of indices) to retrieve. If None, samples from all walkers will be obtained. @@ -58,12 +56,12 @@ def read_acceptance_fraction(self, walkers=None): def write_acceptance_fraction(self, acceptance_fraction): """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction]`. + the ``[sampler_group]/acceptance_fraction``. Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. + acceptance_fraction : numpy.ndarray + Array of acceptance fractions to write. """ group = self.sampler_group + '/acceptance_fraction' try: diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 12dcc24..1a2718b 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -36,7 +36,7 @@ # # ============================================================================= # -# Samplers +# Base Sampler definition # # ============================================================================= # @@ -133,6 +133,24 @@ def checkpoint(self): """ pass + @abstractmethod + def finalize(self): + """Do any finalization to the samples file before exiting.""" + pass + + def write_metadata(self, fp): + """Writes metadata about the sampler to the given filehandler.""" + fp.attrs['sampler'] = self.name + # write the model's metadata + self.model.write_metadata(fp) + self._write_more_metadata(fp) + + def _write_more_metadata(self, fp): + """Optional method that can be implemented if a sampler wants to write + more metadata than just its name and the model's metadata. + """ + pass + def setup_output(self, output_file, force=False, injection_file=None): """Sets up the sampler's checkpoint and output files. @@ -199,6 +217,13 @@ def set_target(self, nsamples, require_independent=False): self.require_indep_samples = require_independent +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# def create_new_output_file(sampler, filename, force=False, injection_file=None, **kwargs): @@ -230,9 +255,39 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, logging.info("Creating file {}".format(filename)) with sampler.io(filename, "w") as fp: # save the sampler's metadata - fp.write_metadata(sampler) + sampler.write_metadata(fp) # save injection parameters if injection_file is not None: logging.info("Writing injection file to output") # just use the first one fp.write_injections(injection_file) + +def intial_dist_from_config(cp): + """Loads a distribution for the sampler start from the given config file. + + A distribution will only be loaded if the config file has a [initial-*] + section(s). + + Parameters + ---------- + cp : Config parser + The config parser to try to load from. + + Returns + ------- + JointDistribution or None : + The initial distribution. If no [initial-*] section found in the + config file, will just return None. 
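For reference, a hypothetical config file that triggers this code path contains one or more ``[initial-*]`` sections; that naming is what the helper below looks for via ``get_subsections("initial")``, while the options inside follow whatever distribution is chosen. An illustrative snippet using a pycbc uniform distribution:

    [initial-q]
    name = uniform
    min-q = 1
    max-q = 8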
+ """ + if len(cp.get_subsections("initial")): + logging.info("Using a different distribution for the starting points " + "than the prior.") + initial_dists = distributions.read_distributions_from_config( + cp, section="initial") + constraints = distributions.read_constraints_from_config(cp, + constraint_section="initial_constraint") + init_dist = distributions.JointDistribution(sampler.variable_params, + *initial_dists, **{"constraints" : constraints}) + else: + init_dist = None + return init_dist diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 65d4e86..51f7ef8 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -24,6 +24,8 @@ """Provides constructor classes and convenience functions for MCMC samplers.""" from abc import ABCMeta, abstractmethod, abstractproperty +import logging +import numpy # # ============================================================================= @@ -253,10 +255,6 @@ def run(self): if self.require_indep_samples and self.checkpoint_interval is None: raise ValueError("A checkpoint interval must be set if " "independent samples are required") - # figure out how many iterations I need to run for: this is the target - # number of samples / the number of walkers - target_niters = self.target_nsamples / self.nwalkers - # get the starting number of samples: # "nsamples" keeps track of the number of samples we've obtained (if # require_indep_samples is used, this is the number of independent @@ -270,83 +268,82 @@ def run(self): startiter = 0 if self.require_indep_samples: with self.io(self.checkpoint_file, "r") as fp: - nsamples = fp.n_independent_samples + nsamples = fp.n_indep_samples else: # the number of samples is the number of iterations times the # number of walkers nsamples = startiter * self.nwalkers - # to ensure iterations are counted properly, the sampler's lastclear # should be the same as start self._lastclear = startiter - + # keep track of the number of iterations we've done + self._itercounter = startiter + # figure out the interval to use iterinterval = self.checkpoint_interval if iterinterval is None: iterinterval = int(numpy.ceil( float(self.target_nsamples) / self.nwalkers)) - # run sampler until we have the desired number of samples while nsamples < self.target_nsamples: - enditer = startiter + iterinterval - # adjust the interval if we would go past the number of iterations endnsamp = enditer * self.nwalkers if endnsamp > self.target_nsamples \ and not self.require_indep_samples: iterinterval = int(numpy.ceil( (endnsamp - self.target_nsamples) / self.nwalkers)) - # run sampler and set initial values to None so that sampler # picks up from where it left off next call logging.info("Running sampler for {} to {} iterations".format( startiter, enditer)) + # run the underlying sampler for the desired interval self.run_mcmc(iterinterval) - + # dump the current results + self.checkpoint() # update nsamples for next loop - if opts.n_independent_samples is not None: - with InferenceFile(checkpoint_file, 'r') as fp: - nsamples = fp.n_independent_samples + if self.require_indep_samples: + nsamples = self.n_indep_samples logging.info("Have {} independent samples".format(nsamples)) else: - nsamples += interval - + nsamples += iterinterval * self.nwalkers + self._itercounter = startiter = enditer - # clear the in-memory chain to save memory - logging.info("Clearing chain") - sampler.clear_chain() - - start = end + @abstractproperty + def n_indep_samples(self): + """Should return the number of independent samples the 
sampler has + acquired so far.""" + pass @abstractmethod - def run_for_niterations(self, niterations): + def run_mcmc(self, niterations): """Run the MCMC for the given number of iterations.""" pass def checkpoint(self): """Dumps current samples to the checkpoint file.""" # write new samples + logging.info("Writing samples to file") + self.write_results(self.checkpoint_file) + # write other stuff with self.io(checkpoint_file, "a") as fp: - - logging.info("Writing samples to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - logging.info("Updating burn in") - burnidx, is_burned_in = burn_in_eval.update(sampler, fp) + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + # FIXME + # logging.info("Updating burn in") + # burnidx, is_burned_in = burn_in_eval.update(self, fp) # compute the acls and write acls = None - if opts.n_independent_samples is not None or end >= get_nsamples \ - or not opts.checkpoint_fast: + if self.require_indep_samples: logging.info("Computing acls") - acls = sampler.compute_acls(fp) + acls = self.compute_acls(fp) sampler.write_acls(fp, acls) # write to backup with InferenceFile(backup_file, "a") as fp: logging.info("Writing to backup file") - sampler.write_results(fp, static_params=model.static_params, + sampler.write_results(fp, ifos=opts.instruments) sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) if acls is not None: @@ -358,6 +355,10 @@ def checkpoint(self): if not checkpoint_valid: raise IOError("error writing to checkpoint file") + # clear the in-memory chain to save memory + logging.info("Clearing chain") + self.clear_chain() + @classmethod def compute_acfs(cls, fp, start_index=None, end_index=None, diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 7cad975..e01ce7a 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -29,10 +29,13 @@ from __future__ import absolute_import import numpy +import emcee from pycbc.io import FieldArray from pycbc.filter import autocorrelation +from pycbc.pool import choose_pool -from .base import BaseMCMCSampler +from .base import BaseSampler +from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) # @@ -60,158 +63,103 @@ class EmceeEnsembleSampler(BaseMCMC, BaseSampler): """ name = "emcee" - def __init__(self, model, nwalkers, pool=None, - model_call=None): - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") + def __init__(self, model, outfile, nwalkers, + checkpoint_interval=None, resume_from_checkpoint=True, + n_independent_samples=None, niterations=None, + logpost_function=None, + nprocesses=1, use_mpi=False): + + self.model = model + # create a wrapper for calling the model + if logpost_function is None: + logpost_function = ='logposterior' + model_call = models.CallModel(model, logpost_function) + + # Set up the pool + if nprocesses > 1: + # these are used to help paralleize over multiple cores / MPI + models._global_instance = model_call + model_call = models._call_global_model + pool = choose_pool(mpi=use_mpi, processes=nprocesses) + if pool is not None: + pool.count = nprocesses + + self.outfile = outfile + self._nwalkers = nwalkers - if model_call is None: - model_call = model + # set up checkpointing + self.setup_checkpoint(outfile, + resume_from_checkpoint=resume_from_checkpoint) + # set up emcee ndim = len(model.variable_params) - sampler = emcee.EnsembleSampler(nwalkers, ndim, - model_call, - pool=pool) + self._sampler = 
emcee.EnsembleSampler(nwalkers, ndim, model_call, + pool=pool) # emcee uses it's own internal random number generator; we'll set it # to have the same state as the numpy generator rstate = numpy.random.get_state() - sampler.random_state = rstate - self._sampler = sampler - self._nwalkers = nwalkers + self._sampler.random_state = rstate - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. + @property + def io(self): + return EmceeFile - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. + def _write_more_metadata(self, fp): + """Adds nwalkers to the metadata.""" + fp.attrs['nwalkers'] = self.nwalkers - Returns - ------- - EmceeEnsembleSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.nwalkers, - pool=pool, model_call=model_call) + @property + def base_shape(self): + return (self.nwalkers,) @property - def raw_samples(self): - """A dict mapping sampling_params to arrays of samples currently + def samples(self): + """A dict mapping ``variable_params`` to arrays of samples currently in memory. - The arrays have shape ``nwalkers`` x ``niterations``. + The arrays have shape ``nwalkers x niterations``. """ - # chain is a [additional dimensions x] niterations x ndim array - samples = self.chain - sampling_params = self.sampling_params - # convert to dictionary to apply boundary conditions - samples = {param: samples[..., ii] for - ii, param in enumerate(sampling_params)} - samples = self.model._prior.apply_boundary_conditions( - **samples) - # now convert to field array - samples = FieldArray.from_arrays([samples[param] - for param in sampling_params], - names=sampling_params) - # apply transforms to go to model params space - return self.model.apply_sampling_transforms( - samples, inverse=True) + # emcee stores samples to it's chain attribute as a + # nwalker x niterations x ndim array + raw_samples = self._sampler.chain + return raw_samples_to_dict(self, raw_samples) @property def model_stats(self): - """Returns the model stats as a FieldArray, with field names - corresponding to the type of data returned by the model. - The returned array has shape nwalkers x niterations. If no additional - stats were returned to the sampler by the model, returns - None. + """A dict mapping the model's ``default_stats`` to arrays of values. + + The returned array has shape ``nwalkers x niterations``. """ - stats = numpy.array(self._sampler.blobs) - if stats.size == 0: - return None - # we'll force arrays to float; this way, if there are `None`s in the - # blobs, they will be changed to `nan`s - arrays = {field: stats[..., fi].astype(float) - for fi, field in - enumerate(self.model.metadata_fields)} - return FieldArray.from_kwargs(**arrays).transpose() + return raw_samples_to_dict(self._sampler.blobs, raw_stats) - @property - def lnpost(self): - """Get the natural logarithm of the likelihood as an - nwalkers x niterations array. - """ - # emcee returns nwalkers x niterations - return self._sampler.lnprobability - - @property - def chain(self): - """Get all past samples as an nwalker x niterations x ndim array.""" - # emcee returns the chain as nwalker x niterations x ndim - return self._sampler.chain - - def clear_chain(self): - """Clears the chain and blobs from memory. + def clear_samples(self): + """Clears the samples and stats from memory. 
""" # store the iteration that the clear is occuring on - self.lastclear = self.niterations + self._lastclear = self.niterations # now clear the chain self._sampler.reset() self._sampler.clear_blobs() - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An nwalkers x ndim array of the initial positions that were set. - """ - # we define set_p0 here to ensure that emcee's internal random number - # generator is set to numpy's after the distributions' rvs functions - # are called - super(EmceeEnsembleSampler, self).set_p0(samples_file=samples_file, - prior=prior) - # update the random state - self._sampler.random_state = numpy.random.get_state() - - def write_state(self, fp): - """Saves the state of the sampler in a file. - """ - fp.write_random_state(state=self._sampler.random_state) - - def set_state_from_file(self, fp): + def set_state_from_file(self, filename): """Sets the state of the sampler back to the instance saved in a file. """ - rstate = fp.read_random_state() + with self.io(filename, 'r') as fp: + rstate = fp.read_random_state() # set the numpy random state numpy.random.set_state(rstate) # set emcee's generator to the same state self._sampler.random_state = rstate - def run(self, niterations, **kwargs): + def run_mcmc(self, niterations, **kwargs): """Advance the ensemble for a number of samples. Parameters ---------- niterations : int - Number of samples to get from sampler. + Number of iterations to run the sampler for. + \**kwargs : + All other keyword arguments are passed to the emcee sampler. Returns ------- @@ -227,37 +175,44 @@ def run(self, niterations, **kwargs): if pos is None: pos = self.p0 res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] + p, _, _ = res[0], res[1], res[2] # update the positions self._pos = p - return p, lnpost, rstate - def write_results(self, fp, start_iteration=None, - max_iterations=None, **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. + def write_results(self, filename): + """Writes samples, model stats, acceptance fraction, and random state + to the given file. Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. + filename : str + The file to write to. The file is opened using the ``io`` class + in an an append state. 
""" - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) + with self.io(filename, 'a') as fp: + # write samples + fp.write_samples(self.samples, self.model.variable_params) + # write stats + fp.write_samples(self.model_stats) + # write accpetance + fp.write_acceptance_fraction(self._sampler.acceptance_fraction) + # write random state + fp.write_random_state(state=self._sampler.random_state) + + + @classmethod + def from_config(cls, cp, model, outfile, nprocesses=1, use_mpi=False): + """Loads the sampler from the given config file.""" + section = "sampler" + # check name + assert cp.get(section, "name") == cls.name, ( + "name in section [sampler] must match mine") + # get the number of walkers to use + nwalkers = int(cp.get(section, "nwalkers")) + if cp.has_option(section, "logpost-function"): + lnpost = cp.get(section, "logpost-function") + else: + lnpost = None + return cls(model, outfile, nwalkers, logpost_function=lnpost, + nprocesses=nprocesses, use_mpi=use_mpi) + From 866f39a8658beb58e873314e7917d7b94bdbccb6 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 19:57:35 -0400 Subject: [PATCH 17/47] fix whitespace --- gwin/sampler/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 1a2718b..e0cb543 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -262,6 +262,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, # just use the first one fp.write_injections(injection_file) + def intial_dist_from_config(cp): """Loads a distribution for the sampler start from the given config file. From 5b90d77b89ab62eda5033d3ae5d8f11a9367a302 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 21:04:45 -0400 Subject: [PATCH 18/47] add acl support --- gwin/io/base_hdf.py | 6 +- gwin/io/base_mcmc.py | 52 +++++++++- gwin/sampler/base_mcmc.py | 205 +++++++++++++++----------------------- gwin/sampler/emcee.py | 2 +- 4 files changed, 138 insertions(+), 127 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 39fd96a..855d6ac 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -144,7 +144,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): and ``parse_parameters`` methods. If None, will return a ``FieldArray``. \**kwargs : - All other keyword arguments are passed to ``_read_samples_data``. + All other keyword arguments are passed to ``read_raw_samples``. Returns ------- @@ -157,7 +157,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): # get the names of fields needed for the given parameters possible_fields = self[self.samples_group].keys() loadfields = array_class.parse_parameters(parameters, possible_fields) - samples = self._read_samples_data(loadfields, **kwargs) + samples = self.read_raw_samples(loadfields, **kwargs) # convert to FieldArray samples = array_class.from_kwargs(**samples) # add the static params @@ -166,7 +166,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): return samples @abstractmethod - def _read_samples_data(self, fields, **kwargs): + def read_raw_samples(self, fields, **kwargs): """Low level function for reading datasets in the samples group. This should return a dictionary of numpy arrays. 
diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py
index 302aed3..3c54d77 100644
--- a/gwin/io/base_mcmc.py
+++ b/gwin/io/base_mcmc.py
@@ -112,7 +112,7 @@ def write_samples(self, samples, parameters=None,
                                       dtype=float, fletcher32=True)
             fp[dataset_name][:, istart:istop] = samples[param]
 
-    def _read_samples_data(self, fields,
+    def read_raw_samples(self, fields,
                            thin_start=None, thin_interval=None, thin_end=None,
                            iteration=None, walkers=None, flatten=True):
         """Base function for reading samples.
@@ -167,3 +167,53 @@ def write_resume_point(self):
             resume_pts.append(niterations)
             self.attrs["resume_points"] = resume_pts
 
+    def write_acls(self, acls):
+        """Writes the given autocorrelation lengths.
+
+        The ACL of each parameter is saved to
+        ``[sampler_group]/acls/{param}``. The maximum over all the
+        parameters is saved to the file's 'acl' attribute.
+
+        Parameters
+        ----------
+        acls : dict
+            A dictionary of ACLs keyed by the parameter.
+
+        Returns
+        -------
+        int or float
+            The maximum of the ACLs that was written to the file.
+        """
+        group = self.sampler_group + '/acls/{}'
+        # write the individual acls
+        for param in acls:
+            try:
+                # we need to use the write_direct function because it's
+                # apparently the only way to update scalars in h5py
+                self[group.format(param)].write_direct(
+                    numpy.array(acls[param]))
+            except KeyError:
+                # dataset doesn't exist yet
+                self[group.format(param)] = acls[param]
+        # write the maximum over all params
+        self.attrs['acl'] = numpy.array(acls.values()).max()
+        return self.attrs['acl']
+
+    def read_acls(self):
+        """Reads the ACLs of all the parameters.
+
+        Returns
+        -------
+        dict
+            A dictionary of the ACLs, keyed by the parameter name. 
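A sketch of the round trip provided by the write/read pair being added here (the values are illustrative; ``EmceeFile`` is one of the file classes in this patch series that inherits these methods):

    from gwin.io.emcee import EmceeFile

    with EmceeFile("inference.hdf.checkpoint", "a") as fp:
        max_acl = fp.write_acls({'mass1': 12, 'mass2': 16})  # also sets fp.attrs['acl'] = 16
        acls = fp.read_acls()  # -> {'mass1': 12, 'mass2': 16}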
+ """ + group = self[self.sampler_group]['acls'] + return {param: group[param].value for param in group.keys()} + + + diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 51f7ef8..8e1f06c 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -324,44 +324,54 @@ def checkpoint(self): # write new samples logging.info("Writing samples to file") self.write_results(self.checkpoint_file) - # write other stuff - with self.io(checkpoint_file, "a") as fp: - # write the current number of iterations - fp.attrs['niterations'] = self.niterations - # FIXME - # logging.info("Updating burn in") - # burnidx, is_burned_in = burn_in_eval.update(self, fp) - - # compute the acls and write - acls = None - if self.require_indep_samples: - logging.info("Computing acls") - acls = self.compute_acls(fp) - sampler.write_acls(fp, acls) - - # write to backup - with InferenceFile(backup_file, "a") as fp: - - logging.info("Writing to backup file") - sampler.write_results(fp, - ifos=opts.instruments) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: - sampler.write_acls(fp, acls) - + logging.info("Writing to backup file") + self.write_results(self.backup_file) + # compute the acls + acls = None + if self.require_indep_samples: + logging.info("Computing acls") + acls = self.compute_acls(self.checkpoint_file) + # FIXME: + # logging.info("Updating burn in") + # burnidx, is_burned_in = burn_in_eval.update(self, fp) + # write + for fn in [self.checkpoint_file, self.backup_file]: + with self.io(fn, "a") as fp: + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + # FIXME: + #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + if acls is not None: + fp.write_acls(acls) # check validity - checkpoint_valid = validate_checkpoint_files(checkpoint_file, - backup_file) + checkpoint_valid = validate_checkpoint_files( + self.checkpoint_file, self.backup_file) if not checkpoint_valid: raise IOError("error writing to checkpoint file") - # clear the in-memory chain to save memory logging.info("Clearing chain") self.clear_chain() + @abstractmethod + def compute_acf(cls, filename, **kwargs): + """A method to compute the autocorrelation function of samples in the + given file.""" + pass + + @abstractmethod + def compute_acl(cls, filename, **kwargs): + """A method to compute the autocorrelation length of samples in the + given file.""" + pass + + + +class EnsembleMCMCAutocorrSupport(object): + """Provides class methods for calculating ensemble ACFs/ACLs. + """ @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, + def compute_acfs(cls, filename, start_index=None, end_index=None, per_walker=False, walkers=None, parameters=None): """Computes the autocorrleation function of the model params in the given file. @@ -372,8 +382,8 @@ def compute_acfs(cls, fp, start_index=None, end_index=None, Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. + filename : str + Name of a samples file to compute ACFs for. start_index : {None, int} The start index to compute the acl from. If None, will try to use the number of burn-in iterations in the file; otherwise, will start @@ -392,39 +402,41 @@ def compute_acfs(cls, fp, start_index=None, end_index=None, Returns ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape + dict : + Dictionary of arrays giving the ACFs for each parameter. 
If + ``per-walker`` is True, the arrays will have shape ``nwalkers x niterations``. """ acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - for param in parameters: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] - for ii in walkers] - acfs[param] = numpy.vstack(arrays) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - walkers=walkers, - flatten=False)[param] - samples = samples.mean(axis=0) - acfs[param] = autocorrelation.calculate_acf(samples).numpy() - return FieldArray.from_kwargs(**acfs) + with cls.io(filename, 'r') as fp: + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + for param in parameters: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [ + cls.compute_acfs(filename, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] + for ii in walkers] + acfs[param] = numpy.vstack(arrays) + else: + samples = fp.read_raw_samples( + fp, param, thin_start=start_index, thin_interval=1, + thin_end=end_index, walkers=walkers, + flatten=False)[param] + samples = samples.mean(axis=0) + acfs[param] = autocorrelation.calculate_acf( + samples).numpy() + return acfs @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): + def compute_acls(cls, filename, start_index=None, end_index=None): """Computes the autocorrleation length for all model params in the given file. @@ -434,8 +446,8 @@ def compute_acls(cls, fp, start_index=None, end_index=None): Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. + filename : str + Name of a samples file to compute ACLs for. start_index : {None, int} The start index to compute the acl from. If None, will try to use the number of burn-in iterations in the file; otherwise, will start @@ -450,69 +462,18 @@ def compute_acls(cls, fp, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - for param in fp.variable_params: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - flatten=False)[param] - samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - acls[param] = acl + with cls.io(filename, 'r') as fp: + for param in fp.variable_params: + samples = fp.read_raw_samples( + fp, param, thin_start=start_index, thin_interval=1, + thin_end=end_index, flatten=False)[param] + samples = samples.mean(axis=0) + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + acls[param] = acl return acls - @staticmethod - def write_acls(fp, acls): - """Writes the given autocorrelation lengths to the given file. - - The ACL of each parameter is saved to ``fp['acls/{param}']``. - The maximum over all the parameters is saved to the file's 'acl' - attribute. - - Parameters - ---------- - fp : InferenceFile - An open file handler to write the samples to. - acls : dict - A dictionary of ACLs keyed by the parameter. 
- - Returns - ------- - ACL - The maximum of the acls that was written to the file. - """ - group = 'acls/{}' - # write the individual acls - for param in acls: - try: - # we need to use the write_direct function because it's - # apparently the only way to update scalars in h5py - fp[group.format(param)].write_direct(numpy.array(acls[param])) - except KeyError: - # dataset doesn't exist yet - fp[group.format(param)] = acls[param] - # write the maximum over all params - fp.attrs['acl'] = numpy.array(acls.values()).max() - return fp.attrs['acl'] - - @staticmethod - def read_acls(fp): - """Reads the acls of all the parameters in the given file. - - Parameters - ---------- - fp : InferenceFile - An open file handler to read the acls from. - - Returns - ------- - dict - A dictionary of the ACLs, keyed by the parameter name. - """ - group = fp['acls'] - return {param: group[param].value for param in group.keys()} - class MCMCBurnInSupport(object): """Provides methods for estimating burn-in.""" diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index e01ce7a..42663ca 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -46,7 +46,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(BaseMCMC, BaseSampler): +class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. From 764c7411d79cf313091c4aed58c10674b08b375e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 21:05:17 -0400 Subject: [PATCH 19/47] update executable --- bin/gwin | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/bin/gwin b/bin/gwin index 6cf268d..9bf822c 100644 --- a/bin/gwin +++ b/bin/gwin @@ -157,8 +157,9 @@ with ctx: # construct class that will return the natural logarithm of likelihood model = gwin.models.read_from_config(cp, **model_args) - burn_in_eval = burn_in.BurnIn(opts.burn_in_function, - min_iterations=opts.min_burn_in) + # FIXME: move to MCMC sampler + #burn_in_eval = burn_in.BurnIn(opts.burn_in_function, + # min_iterations=opts.min_burn_in) logging.info("Setting up sampler") @@ -190,17 +191,10 @@ with ctx: if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", samples_file) - samples_file = sampler.io(samples_file, "r") init_prior = None - elif len(cp.get_subsections("initial")): - initial_dists = distributions.read_distributions_from_config( - cp, section="initial") - constraints = distributions.read_constraints_from_config(cp, - constraint_section="initial_constraint") - init_prior = distributions.JointDistribution(sampler.variable_params, - *initial_dists, **{"constraints" : constraints}) else: - init_prior = None + # try to load an initial distribution from the config file + init_prior = gwin.sampler.inital_dist_from_config(cp) sampler.set_initial_conditions(intial_distribution=init_prior, samples_file=samples_file) @@ -211,24 +205,24 @@ with ctx: # Run the sampler sampler.run() - # finalize and exit + # Finalize the output sampler.finalize() - # compute evidence, if supported - with InferenceFile(checkpoint_file, 'a') as fp: - try: - lnz, dlnz = sampler.calculate_logevidence(fp) - logging.info("Saving evidence") - sampler.write_logevidence(fp, lnz, dlnz) - except NotImplementedError: - pass + # FIXME: move to emcee_pt's finalize method + #with InferenceFile(checkpoint_file, 'a') as fp: + # try: + # lnz, dlnz = 
sampler.calculate_logevidence(fp) + # logging.info("Saving evidence") + # sampler.write_logevidence(fp, lnz, dlnz) + # except NotImplementedError: + # pass # rename checkpoint to output and delete backup logging.info("Moving checkpoint to output") -os.rename(checkpoint_file, opts.output_file) +os.rename(sampler.checkpoint_file, opts.output_file) if not opts.save_backup: logging.info("Deleting backup file") - os.remove(backup_file) + os.remove(sampler.backup_file) # exit logging.info("Done") From 24a9b4f9b50d6db4c1487f3522cba8898b5f0020 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 21:14:31 -0400 Subject: [PATCH 20/47] add finalize to emcee, fix typos --- gwin/sampler/base_mcmc.py | 4 ++-- gwin/sampler/emcee.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 8e1f06c..8a49477 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -408,7 +408,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, ``nwalkers x niterations``. """ acfs = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: if parameters is None: parameters = fp.variable_params if isinstance(parameters, str) or isinstance(parameters, unicode): @@ -462,7 +462,7 @@ def compute_acls(cls, filename, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: for param in fp.variable_params: samples = fp.read_raw_samples( fp, param, thin_start=start_index, thin_interval=1, diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 42663ca..9da1bba 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -62,17 +62,15 @@ class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): cores/nodes/etc. 
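As a usage example, direct construction with the signature this hunk settles on would look like the following (argument values are made up; the ``from_config`` classmethod further down builds the same object from a config file):

    from gwin.sampler.emcee import EmceeEnsembleSampler

    sampler = EmceeEnsembleSampler(model, nwalkers=200,
                                   logpost_function='logposterior',
                                   nprocesses=4, use_mpi=False)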
""" name = "emcee" + _io = EmceeFile - def __init__(self, model, outfile, nwalkers, - checkpoint_interval=None, resume_from_checkpoint=True, - n_independent_samples=None, niterations=None, - logpost_function=None, + def __init__(self, model, nwalkers, logpost_function=None, nprocesses=1, use_mpi=False): self.model = model # create a wrapper for calling the model if logpost_function is None: - logpost_function = ='logposterior' + logpost_function = 'logposterior' model_call = models.CallModel(model, logpost_function) # Set up the pool @@ -84,14 +82,8 @@ def __init__(self, model, outfile, nwalkers, if pool is not None: pool.count = nprocesses - self.outfile = outfile - self._nwalkers = nwalkers - - # set up checkpointing - self.setup_checkpoint(outfile, - resume_from_checkpoint=resume_from_checkpoint) - # set up emcee + self._nwalkers = nwalkers ndim = len(model.variable_params) self._sampler = emcee.EnsembleSampler(nwalkers, ndim, model_call, pool=pool) @@ -102,7 +94,7 @@ def __init__(self, model, outfile, nwalkers, @property def io(self): - return EmceeFile + return self._io def _write_more_metadata(self, fp): """Adds nwalkers to the metadata.""" @@ -200,8 +192,23 @@ def write_results(self, filename): fp.write_random_state(state=self._sampler.random_state) + def finalize(self): + """Finalize the samples file.""" + # Compute/write final ACL + acls = self.compute_acls(self.checkpoint_file) + # FIXME: + # logging.info("Updating burn in") + # burnidx, is_burned_in = burn_in_eval.update(self, fp) + # write + with self.io(self.checkpoint_file, "a") as fp: + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + # FIXME: + #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + fp.write_acls(acls) + @classmethod - def from_config(cls, cp, model, outfile, nprocesses=1, use_mpi=False): + def from_config(cls, cp, model, nprocesses=1, use_mpi=False): """Loads the sampler from the given config file.""" section = "sampler" # check name @@ -213,6 +220,6 @@ def from_config(cls, cp, model, outfile, nprocesses=1, use_mpi=False): lnpost = cp.get(section, "logpost-function") else: lnpost = None - return cls(model, outfile, nwalkers, logpost_function=lnpost, + return cls(model, nwalkers, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) From c35a28f177c1bd0bedc11e02bbeb7a5d6e8fdc75 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 17 Jul 2018 07:42:22 -0400 Subject: [PATCH 21/47] change write_posterior to expect filename, not file --- gwin/io/base_hdf.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 855d6ac..06bbe34 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -103,8 +103,9 @@ def parse_parameters(self, parameters, array_class=None): to derive the virtual field or method), and/or a function of these. array_class : array class, optional - The type of array to use to parse the parameters. The class must have a - ``parse_parameters`` method. Default is to use a ``FieldArray``. + The type of array to use to parse the parameters. The class must + have a ``parse_parameters`` method. Default is to use a + ``FieldArray``. Returns ------- @@ -174,14 +175,14 @@ def read_raw_samples(self, fields, **kwargs): pass @abstractmethod - def write_posterior(self, posterior_fp, **kwargs): + def write_posterior(self, posterior_file, **kwargs): """This should write a posterior plus any other metadata to the given file. 
Parameters ---------- - posterior_fp : open hdf file - The file to write to. + posterior_file : str + Name of the file to write to. \**kwargs : Any other keyword args the sampler needs to write the posterior. """ From 26fc718b99aae43af6c02e6c5eaaa33b8d30d495 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 20 Jul 2018 10:34:50 -0400 Subject: [PATCH 22/47] change burn in module to just have functions --- gwin/burn_in.py | 420 +++++++++++------------------------------------- 1 file changed, 97 insertions(+), 323 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index bcb4ef6..895ba44 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -19,374 +19,148 @@ """ import numpy - from scipy.stats import ks_2samp +from pycbc.filter import autocorrelation + -def ks_test(sampler, fp, threshold=0.9): - """Burn in based on whether the p-value of the KS test between the samples - at the last iteration and the samples midway along the chain for each - parameter is > ``threshold``. +def ks_test(samples1, samples2, threshold=0.9): + """Applies a KS test to determine if two sets of samples are the same. + + The ks test is applied parameter-by-parameter. If the two-tailed p-value + returned by the test is greater than ``threshold``, the samples are + considered to be the same. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + samples1 : dict + Dictionary of mapping parameters to the first set of samples. + samples2 : dict + Dictionary of mapping parameters to the second set of samples. threshold : float The thershold to use for the p-value. Default is 0.9. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. + dict : + Dictionary mapping parameter names to booleans indicating whether the + given parameter passes the KS test. """ - nwalkers = fp.nwalkers - niterations = fp.niterations - # Create a dictionary which would have keys are the variable args and - # values are booleans indicating whether the p-value for the parameters - # satisfies the KS test - is_burned_in_param = {} + is_the_same = {} + assert set(samples1.keys()) == set(samples2.keys()), ( + "samples1 and 2 must have the same parameters") # iterate over the parameters - for param in fp.variable_params: - # read samples for the parameter from the last iteration of the chain - samples_last_iter = sampler.read_samples(fp, param, iteration=-1, - flatten=True)[param] - # read samples for the parameter from the iteration midway - # along the chain - samples_chain_midpt = sampler.read_samples( - fp, param, iteration=int(niterations/2), flatten=True)[param] + for param in samples1: + s1 = samples1[param] + s2 = samples2[param] _, p_value = ks_2samp(samples_last_iter, samples_chain_midpt) - # check if p_value is > than the desired range - is_burned_in_param[param] = p_value > threshold - - # The chains are burned in if the p-value of the KS test lies - # in the range [0.1,0.9] for all the parameters. - # If the KS test is passed, the chains have burned in at their - # mid-way point. 
- if all(is_burned_in_param.values()): - is_burned_in = numpy.ones(nwalkers, dtype=bool) - burn_in_idx = numpy.repeat(niterations/2, nwalkers).astype(int) - else: - is_burned_in = numpy.zeros(nwalkers, dtype=bool) - burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int) - return burn_in_idx, is_burned_in + is_the_same[param] = p_value > threshold + return is_the_same -def n_acl(sampler, fp, nacls=10): +def n_acl(chain, nacls=5): """Burn in based on ACL. - The sampler is considered burned in if the number of itertions is >= - ``nacls`` times the maximum ACL over all parameters, as measured from the - first iteration. + This applies the following test to determine burn in: + + 1. The first half of the chain is ignored. + + 2. An ACL is calculated from the second half. + + 3. If ``nacls`` times the ACL is < the number of iterations / 2, + the chain is considered to be burned in at the half-way point. Parameters ---------- - sampler : pycbc.inference.sampler - Sampler to determine burn in for. May be either an instance of a - `inference.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. - nacls : int - Number of ACLs to use for burn in. Default is 10. + chain : array + The chain of samples to apply the test to. Must be 1D. + nacls : int, optional + Number of ACLs to use for burn in. Default is 5. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. By definition - of this function, all chains reach burn in at the same iteration. Thus - the returned array is the burn-in index repeated by the number of - chains. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. Since - all chains obtain burn in at the same time, this is either an array - of all False or True. + burn_in_idx : int + The burn in index. If the chain is not burned in, will be equal to the + length of the chain. + is_burned_in : bool + Whether or not the chain is burned in. + acl : int + The ACL that was estimated. """ - acl = numpy.array(sampler.compute_acls(fp, start_index=0).values()).max() - burn_idx = nacls * acl - is_burned_in = burn_idx < fp.niterations - if not is_burned_in: - burn_idx = fp.niterations - nwalkers = fp.nwalkers - return numpy.repeat(burn_idx, nwalkers).astype(int), \ - numpy.repeat(is_burned_in, nwalkers).astype(bool) + kstart = int(len(chain)/2.) + acl = autocorrelation.calculate_acl(chain[kstart:]) + is_burned_in = nacls * acl < kstart + if is_burned_in: + burn_in_idx = kstart + else: + burn_in_idx = len(chain) + return burn_in_idx, is_burned_in, acl -def max_posterior(sampler, fp): +def max_posterior(lnps_per_walker, dim): """Burn in based on samples being within dim/2 of maximum posterior. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + lnps_per_walker : 2D array + Array of values that are proportional to the log posterior values. Must + have shape ``nwalkers x niterations``. + dim : float + The dimension of the parameter space. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. + burn_in_idx : array of int + The burn in indices of each walker. 
If a walker is not burned in, its + index will be be equal to the length of the chain. + is_burned_in : array of bool + Whether or not a walker is burned in. """ - # get the posteriors - # Note: multi-tempered samplers should just return the coldest chain by - # default - chain_stats = sampler.read_samples( - fp, ['loglr', 'logprior'], samples_group=fp.stats_group, - thin_interval=1, thin_start=0, thin_end=None, flatten=False) - chain_posteriors = chain_stats['loglr'] + chain_stats['logprior'] - dim = float(len(fp.variable_params)) - - # find the posterior to compare against - max_p = chain_posteriors.max() - criteria = max_p - dim/2 - nwalkers = chain_posteriors.shape[-2] - niterations = chain_posteriors.shape[-1] - burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int) - is_burned_in = numpy.zeros(nwalkers, dtype=bool) - - # find the first iteration in each chain where the logplr has exceeded + if len(lnps_per_walker.shape) != 2: + raise ValueError("lnps_per_walker must have shape " + "nwalkers x niterations") + # find the value to compare against + max_p = lnps_per_walker.max() + criteria = max_p - dim/2. + nwalkers, niterations = lnps_per_walker.shape + burn_in_idx = numpy.empty(nwalkers, dtype=int) + is_burned_in = numpy.empty(nwalkers, dtype=bool) + # find the first iteration in each chain where the logpost has exceeded # max_p - dim/2 for ii in range(nwalkers): - chain = chain_posteriors[..., ii, :] - # numpy.where will return a tuple with multiple arrays if the chain is - # more than 1D (which can happen for multi-tempered samplers). Always - # taking the last array ensures we are looking at the indices that - # count out iterations - idx = numpy.where(chain >= criteria)[-1] - if idx.size != 0: - burn_in_idx[ii] = idx[0] - is_burned_in[ii] = True + chain = lnps_per_walker[ii,:] + passedidx = numpy.where(chain >= criteria)[0] + is_burned_in[ii] = is_burned_in = passedidx.size > 0 + if is_burned_in: + burn_in_idx[ii] = passedidx[0] + else: + burn_in_idx[ii] = niterations return burn_in_idx, is_burned_in -def posterior_step(sampler, fp): - """Burn in based on the last time a chain made a jump > dim/2. +def posterior_step(logposts, dim): + """Finds the last time a chain made a jump > dim/2. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + logposts : array + 1D array of values that are proportional to the log posterior values. + dim : float + The dimension of the parameter space. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - By definition of this function, all values are set to True. + int + The index of the last time the logpost made a jump > dim/2. If that + never happened, returns 0. 
""" - # get the posteriors - # Note: multi-tempered samplers should just return the coldest chain by - # default - chain_stats = sampler.read_samples( - fp, ['loglr', 'logprior'], samples_group=fp.stats_group, - thin_interval=1, thin_start=0, thin_end=None, flatten=False) - chain_posteriors = chain_stats['loglr'] + chain_stats['logprior'] - nwalkers = chain_posteriors.shape[-2] - dim = float(len(fp.variable_params)) - burn_in_idx = numpy.zeros(nwalkers).astype(int) + if logposts.ndim > 1: + raise ValueError("logposts must be a 1D array") criteria = dim/2. - - # find the last iteration in each chain where the logplr has - # jumped by more than dim/2 - for ii in range(nwalkers): - chain = chain_posteriors[..., ii, :] - dp = abs(numpy.diff(chain)) - idx = numpy.where(dp >= criteria)[-1] - if idx.size != 0: - burn_in_idx[ii] = idx[-1] + 1 - return burn_in_idx, numpy.ones(nwalkers, dtype=bool) - - -def half_chain(sampler, fp): - """Takes the second half of the iterations as post-burn in. - - Parameters - ---------- - sampler : gwin.sampler - This option is not used; it is just here give consistent API as the - other burn in functions. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. - - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - By definition of this function, all values are set to True. - """ - nwalkers = fp.nwalkers - niterations = fp.niterations - return ( - numpy.repeat(niterations/2, nwalkers).astype(int), - numpy.ones(nwalkers, dtype=bool), - ) - - -def use_sampler(sampler, fp=None): - """Uses the sampler's burn_in function. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. Must be an instance of an - `gwin.sampler` that has a `burn_in` function. - fp : InferenceFile, optional - This option is not used; it is just here give consistent API as the - other burn in functions. - - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - Since the sampler's burn in function will run until all chains - are burned, all values are set to True. - """ - sampler.burn_in() - return ( - sampler.burn_in_iterations, - numpy.ones(len(sampler.burn_in_iterations), dtype=bool), - ) - - -burn_in_functions = { - 'ks_test': ks_test, - 'n_acl': n_acl, - 'max_posterior': max_posterior, - 'posterior_step': posterior_step, - 'half_chain': half_chain, - 'use_sampler': use_sampler, - } - - -class BurnIn(object): - """Class to estimate the number of burn in iterations. - - Parameters - ---------- - function_names : list, optional - List of name of burn in functions to use. All names in the provided - list muset be in the `burn_in_functions` dict. If none provided, will - use no burn-in functions. - min_iterations : int, optional - Minimum number of burn in iterations to use. The burn in iterations - returned by evaluate will be the maximum of this value - and the values returned by the burn in functions provided in - `function_names`. Default is 0. 
- - Examples - -------- - Initialize a `BurnIn` instance that will use `max_posterior` and - `posterior_step` as the burn in criteria: - - >>> import gwin - >>> burn_in = gwin.BurnIn(['max_posterior', 'posterior_step']) - - Use this `BurnIn` instance to find the burn-in iteration of each walker - in an inference result file: - - >>> from pycbc.io import InferenceFile - >>> fp = InferenceFile('gwin.hdf', 'r') - >>> burn_in.evaluate(gwin.samplers[fp.sampler_name], fp) - array([11486, 11983, 11894, ..., 11793, 11888, 11981]) - - """ - - def __init__(self, function_names, min_iterations=0): - if function_names is None: - function_names = [] - self.min_iterations = min_iterations - self.burn_in_functions = {fname: burn_in_functions[fname] - for fname in function_names} - - def evaluate(self, sampler, fp): - """Evaluates sampler's chains to find burn in. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for - determing burn in. - - Returns - ------- - burnidx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - """ - # if the number of iterations is < than the minimium desired, - # just return the number of iterations and all False - if fp.niterations < self.min_iterations: - return numpy.repeat(self.min_iterations, fp.nwalkers), \ - numpy.zeros(fp.nwalkers, dtype=bool) - # if the file already has burn in iterations saved, use those as a - # base - try: - burnidx = fp['burn_in_iterations'][:] - except KeyError: - # just use the minimum - burnidx = numpy.repeat(self.min_iterations, fp.nwalkers) - # start by assuming is burned in; the &= below will make this false - # if any test yields false - is_burned_in = numpy.ones(fp.nwalkers, dtype=bool) - if self.burn_in_functions != {}: - newidx = [] - for func in self.burn_in_functions.values(): - idx, state = func(sampler, fp) - newidx.append(idx) - is_burned_in &= state - newidx = numpy.vstack(newidx).max(axis=0) - # update the burn in idx if any test yields a larger iteration - mask = burnidx < newidx - burnidx[mask] = newidx[mask] - # if any burn-in idx are less than the min iterations, set to the - # min iterations - burnidx[burnidx < self.min_iterations] = self.min_iterations - return burnidx, is_burned_in - - def update(self, sampler, fp): - """Evaluates burn in and saves the updated indices to the given file. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for - determing burn in. - - Returns - ------- - burnidx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. 
- """ - burnidx, is_burned_in = self.evaluate(sampler, fp) - sampler.burn_in_iterations = burnidx - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - return burnidx, is_burned_in + dp = numpy.diff(logposts) + indices = numpy.where(dp >= criteria)[0] + if indices.size > 0: + idx = indices[-1] + 1 + else: + idx = 0 + return idx From 8d69b1598e80c60abe7d2a198cd672c89b3c60ad Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 20 Jul 2018 10:35:10 -0400 Subject: [PATCH 23/47] start to define burn in support class --- gwin/sampler/base_mcmc.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 8a49477..ddc5289 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -476,7 +476,15 @@ def compute_acls(cls, filename, start_index=None, end_index=None): class MCMCBurnInSupport(object): - """Provides methods for estimating burn-in.""" + """Provides methods for estimating burn-in of an ensemble MCMC.""" + + def __init__(self, burn_in_tests): + self.burn_in_tests = burn_in_tests + + def _max_posterior(self, filename): + """Applies max posterior test to self.""" + with self.io(filename, 'r') as fp: + samples = self.read_samples() def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): """Writes the burn in iterations to the given file. From 8a6506a4ee21acc7796e652efa7ed402b5c193f0 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 12:28:13 +0200 Subject: [PATCH 24/47] move burn in class to burn_in module; add evaluate --- gwin/burn_in.py | 145 +++++++++++++++++++++++++++++++++++++- gwin/sampler/base_mcmc.py | 43 +---------- 2 files changed, 146 insertions(+), 42 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 895ba44..d872860 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -13,6 +13,14 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# """ This modules provides classes and functions for determining when Markov Chains have burned in. @@ -23,7 +31,16 @@ from pycbc.filter import autocorrelation +# The value to use for a burn-in iteration if a chain is not burned in +NOT_BURNED_IN_ITER = -1 +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# def ks_test(samples1, samples2, threshold=0.9): """Applies a KS test to determine if two sets of samples are the same. 
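As an illustration of the function-based API that PATCH 22 introduces, the sketch below exercises the refactored helpers directly on toy arrays rather than on a sampler/file pair. It is not part of any patch: the toy data, walker counts, and parameter names are invented, and only the ``ks_test``, ``n_acl``, and ``posterior_step`` signatures are taken from the diffs above.

import numpy
from gwin.burn_in import ks_test, n_acl, posterior_step

# fabricate a small ensemble: 10 walkers, 1000 iterations, 2 parameters
rng = numpy.random.RandomState(0)
nwalkers, niterations = 10, 1000
lnps = rng.normal(size=(nwalkers, niterations))

# ks_test compares two dicts of samples parameter-by-parameter and returns
# a dict mapping each parameter name to a bool
samples_mid = {'x': rng.normal(size=nwalkers), 'y': rng.normal(size=nwalkers)}
samples_end = {'x': rng.normal(size=nwalkers), 'y': rng.normal(size=nwalkers)}
passed = ks_test(samples_mid, samples_end, threshold=0.9)

# n_acl operates on a single 1D chain and returns (index, is_burned_in, acl)
burn_idx, burned_in, acl = n_acl(lnps[0, :], nacls=5)

# posterior_step returns the index of the last jump > dim/2 in a 1D chain of
# values proportional to the log posterior
last_jump = posterior_step(lnps[0, :], dim=2.)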
@@ -93,7 +110,7 @@ def n_acl(chain, nacls=5): if is_burned_in: burn_in_idx = kstart else: - burn_in_idx = len(chain) + burn_in_idx = NOT_BURNED_IN_ITER return burn_in_idx, is_burned_in, acl @@ -134,7 +151,7 @@ def max_posterior(lnps_per_walker, dim): if is_burned_in: burn_in_idx[ii] = passedidx[0] else: - burn_in_idx[ii] = niterations + burn_in_idx[ii] = NOT_BURNED_IN_ITER return burn_in_idx, is_burned_in @@ -164,3 +181,127 @@ def posterior_step(logposts, dim): else: idx = 0 return idx + + +# +# ============================================================================= +# +# Burn in classes +# +# ============================================================================= +# + +from pycbc.io.record import get_vars_from_arg + +class MCMCBurnInSupport(object): + """Provides methods for estimating burn-in of an ensemble MCMC.""" + + default_burn_in_iteration = -1 + + def __init__(self, sampler, burn_in_test, **kwargs): + self.sampler = sampler + # determine the burn-in tests that are going to be done + self.do_tests = get_vars_from_arg(burn_in_test) + self.burn_in_test = burn_in_test + self.burn_in_data = {t: {} for t in self.do_tests} + self.is_burned_in = False + self.burn_in_iteration = None + if 'nacl' in burn_in_tests: + # get the number of acls to use + self._nacls = kwargs.pop('nacls', 5) + if 'ks_test' in burn_in_tests: + self._ksthreshold = kwargs.pop('ks_threshold', 0.9) + + def max_posterior(self, filename): + """Applies max posterior test to self.""" + with sampler.io(filename, 'r') as fp: + samples = fp.read_raw_samples( + ['loglikelihood', 'logprior'], thin_start=0, thin_interval=1, + flatten=False) + logposts = samples['loglikelihood'] + samples['logprior'] + burn_in_idx, is_burned_in = burn_in.max_posterior( + logposts, len(self.variable_params)) + data = self.burn_in_data['max_posterior'] + # required things to store + data['is_burned_in'] = is_burned_in.all() + data['burn_in_iteration'] = burn_in_idx.max() + # additional info + data['iteration_per_walker'] = burn_in_idx + data['status_per_walker'] = is_burned_in + + def nacl(self, filename): + """Applies the nacl burn-in test""" + with sampler.io(filename, 'r') as fp: + niters = fp.niterations + kstart = int(niters / 2.) + acls = sampler.compute_acls(filename, start_index=kstart) + is_burned_in = {param: (self._nacls * acl) < kstart + for (param, acl) in acls.items()} + data = self.burn_in_data['nacl'] + # required things to store + data['is_burned_in'] = all(is_burned_in.values()) + if data['is_burned_in']: + data['burn_in_iteration'] = kstart + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + # additional information + data['status_per_parameter'] = is_burned_in + # since we calculated it, save the acls to the sampler + sampler.acls = acls + + def ks_test(self, filename): + """Applies ks burn-in test.""" + with sampler.io(filename, 'r') as fp: + niters = fp.niterations + # get the samples from the mid point + samples1 = fp.read_raw_samples( + ['loglikelihood', 'logprior'], iteration=int(niters/2.)) + # get the last samples + samples2 = fp.read_raw_samples( + ['loglikelihood', 'logprior'], iteration=-1) + # do the test + # is_the_same is a dictionary of params --> bool indicating whether or + # not the 1D marginal is the same at the half way point + is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) + data = self.burn_in_data['ks_test'] + # required things to store + data['is_burned_in'] = all(is_the_same.values()) + if data['is_burned_in']: + data['burn_in_iteration'] = int(niters/2.) 
+ else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + # additional + data['status_per_parameter'] = is_the_same + + def evaluate(self, filename): + """Runs all of the burn-in tests.""" + for tst in self.tests_to_do: + getattr(self, tst)(filename) + # The iteration to use for burn-in depends on the logic in the burn-in + # test string. For example, if the test was 'max_posterior | nacl' and + # max_posterior burned-in at iteration 5000 while nacl burned in at + # iteration 6000, we'd want to use 5000 as the burn-in iteration. + # However, if the test was 'max_posterior & nacl', we'd want to use + # 6000 as the burn-in iteration. The code below handles all cases by + # doing the following: first, take the collection of burn in iterations + # from all the burn in tests that were applied. Next, cycle over the + # iterations in increasing order, checking which tests have burned in + # by that point. Then evaluate the burn-in string at that point to see + # if it passes, and if so, what the iteration is. The first point that + # the test passes is used as the burn-in iteration. + burn_in_iters = numpy.unique([self.data[t]['burn_in_iteration'] + for t in self.do_tests]) + burn_in_iters.sort() + for ii in burn_in_iters: + test_results = {t: (self.data[t]['is_burned_in'] & + self.data[t]['burn_in_iteration'] <= ii) + for t in self.do_tests} + is_burned_in = eval(self.burn_in_test, {"__builtins__": None}, + test_results) + if is_burned_in: + break + self.is_burned_in = is_burned_in + if is_burned_in: + self.burn_in_iteration = ii + else: + self.burn_in_iteration = NOT_BURNED_IN_ITER diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index ddc5289..83959bf 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -327,10 +327,10 @@ def checkpoint(self): logging.info("Writing to backup file") self.write_results(self.backup_file) # compute the acls - acls = None + self.acls = None if self.require_indep_samples: logging.info("Computing acls") - acls = self.compute_acls(self.checkpoint_file) + self.acls = self.compute_acls(self.checkpoint_file) # FIXME: # logging.info("Updating burn in") # burnidx, is_burned_in = burn_in_eval.update(self, fp) @@ -341,7 +341,7 @@ def checkpoint(self): fp.attrs['niterations'] = self.niterations # FIXME: #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: + if self.acls is not None: fp.write_acls(acls) # check validity checkpoint_valid = validate_checkpoint_files( @@ -473,40 +473,3 @@ def compute_acls(cls, filename, start_index=None, end_index=None): acl = samples.size acls[param] = acl return acls - - -class MCMCBurnInSupport(object): - """Provides methods for estimating burn-in of an ensemble MCMC.""" - - def __init__(self, burn_in_tests): - self.burn_in_tests = burn_in_tests - - def _max_posterior(self, filename): - """Applies max posterior test to self.""" - with self.io(filename, 'r') as fp: - samples = self.read_samples() - - def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): - """Writes the burn in iterations to the given file. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - burn_in_iterations : array - Array of values giving the iteration of the burn in of each walker. - is_burned_in : array - Array of booleans indicating which chains are burned in. 
- """ - try: - fp['burn_in_iterations'][:] = burn_in_iterations - except KeyError: - fp['burn_in_iterations'] = burn_in_iterations - fp.attrs['burn_in_iterations'] = burn_in_iterations.max() - if is_burned_in is not None: - try: - fp['is_burned_in'][:] = is_burned_in - except KeyError: - fp['is_burned_in'] = is_burned_in - fp.attrs['is_burned_in'] = is_burned_in.all() - From 2711460e143c2d28a81d4a611005e004808843de Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 12:28:45 +0200 Subject: [PATCH 25/47] add write burn in to io --- gwin/io/base_mcmc.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 3c54d77..f068d12 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -215,5 +215,18 @@ def read_acls(self): group = self[self.sampler_group]['acls'] return {param: group[param].value for param in group.keys()} - - + def write_burn_in(self, burn_in): + """Write the given burn-in data to the given filename.""" + group = self[self.sampler_group] + group.attrs['is_burned_in'] = burn_in.is_burned_in + group.attrs['burn_in_iteration'] = burn_in.burn_in_iteration + group.attrs['burn_in_test'] = burn_in.burn_in_test + # write individual test data + for tst in burn_in.burn_in_data: + key = 'burn_in_tests/{}'.format(tst) + try: + attrs = group[key].attrs + except KeyError: + group.create_group(key) + attrs = group[key].attrs + write_kwargs_to_hdf_attrs(attrs, **burn_in.burn_in_data[tst]) From 59201ffdb3b8816399b2b92187a7d41da28ea64f Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 14:43:46 +0200 Subject: [PATCH 26/47] add from_config for burn-in class --- gwin/burn_in.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index d872860..5e5793e 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -196,8 +196,6 @@ def posterior_step(logposts, dim): class MCMCBurnInSupport(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" - default_burn_in_iteration = -1 - def __init__(self, sampler, burn_in_test, **kwargs): self.sampler = sampler # determine the burn-in tests that are going to be done @@ -208,9 +206,9 @@ def __init__(self, sampler, burn_in_test, **kwargs): self.burn_in_iteration = None if 'nacl' in burn_in_tests: # get the number of acls to use - self._nacls = kwargs.pop('nacls', 5) + self._nacls = int(kwargs.pop('nacls', 5)) if 'ks_test' in burn_in_tests: - self._ksthreshold = kwargs.pop('ks_threshold', 0.9) + self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) def max_posterior(self, filename): """Applies max posterior test to self.""" @@ -305,3 +303,17 @@ def evaluate(self, filename): self.burn_in_iteration = ii else: self.burn_in_iteration = NOT_BURNED_IN_ITER + + @classmethod + def from_config(cls, cp, sampler): + """Loads burn in from section [sampler-burn_in].""" + section = 'sampler' + tag = 'burn_in' + burn_in_test = cp.get_opt_tag(section, 'burn-in-test', tag) + kwargs = {} + if cp.has_option_tag(section, 'nacl', tag): + kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag)) + if cp.has_option_tag(section, 'ks-threshold', tag) + kwargs['ks_threshold'] = float( + cp.get_opt_tag(section, 'ks-threshold', tag) + return cls(sampler, burn_in_test, **kwargs) From 35a8408cd6d1e85553ac8b795ee172627f0e4bc3 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 16:19:36 +0200 Subject: [PATCH 27/47] more support for burn-in, calculation of independent 
samples --- gwin/burn_in.py | 2 +- gwin/sampler/base_mcmc.py | 60 ++++++++++++++++++++++++++++----------- gwin/sampler/emcee.py | 30 +++++++++----------- 3 files changed, 58 insertions(+), 34 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 5e5793e..6e83bb5 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -193,7 +193,7 @@ def posterior_step(logposts, dim): from pycbc.io.record import get_vars_from_arg -class MCMCBurnInSupport(object): +class MCMCBurnInTests(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" def __init__(self, sampler, burn_in_test, **kwargs): diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 83959bf..120846c 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -129,6 +129,7 @@ class BaseMCMC(object): _pos = None _p0 = None _nwalkers = None + _burn_in = None @abstractproperty(self): def base_shape(self): @@ -259,8 +260,8 @@ def run(self): # "nsamples" keeps track of the number of samples we've obtained (if # require_indep_samples is used, this is the number of independent # samples; otherwise, this is the total number of samples). - # "startiter" is the number of iterations that the file already contains - # (either due to sampler burn-in, or a previous checkpoint) + # "startiter" is the number of iterations that the file already + # contains (either due to sampler burn-in, or a previous checkpoint) try: with self.io(self.checkpoint_file, "r") as fp: start = fp.niterations @@ -303,22 +304,44 @@ def run(self): # update nsamples for next loop if self.require_indep_samples: nsamples = self.n_indep_samples - logging.info("Have {} independent samples".format(nsamples)) + logging.info("Have {} independent samples post burn in".format( + nsamples)) else: nsamples += iterinterval * self.nwalkers self._itercounter = startiter = enditer - @abstractproperty + @propetry + def burn_in(self): + """The class for doing burn-in tests (if specified).""" + return self._burn_in + + def set_burn_in(self, burn_in): + """Sets the object to use for doing burn-in tests.""" + self._burn_in = burn_in + def n_indep_samples(self): - """Should return the number of independent samples the sampler has + """The number of independent samples post burn-in that the sampler has acquired so far.""" - pass + if self.acls is None: + acl = numpy.inf + else: + acl = numpy.array(self.acls.values()).max() + if self.burn_in is None: + niters = self.niterations + else: + niters = self.niterations - self.burn_in.burn_in_iteration + return self.nwalkers * int(niters // acl) @abstractmethod def run_mcmc(self, niterations): """Run the MCMC for the given number of iterations.""" pass + @abstractmethod + def write_results(self, filename): + """Should write all samples currently in memory to the given file.""" + pass + def checkpoint(self): """Dumps current samples to the checkpoint file.""" # write new samples @@ -326,23 +349,26 @@ def checkpoint(self): self.write_results(self.checkpoint_file) logging.info("Writing to backup file") self.write_results(self.backup_file) - # compute the acls + # check for burn in, compute the acls self.acls = None - if self.require_indep_samples: + if self.burn_in is not None: + logging.info("Updating burn in") + self.burn_in.evaluate(self.checkpoint_file) + # Compute acls; the burn_in test may have calculated an acl and saved + # it, in which case we don't need to do it again. 
+ if self.acls is None: logging.info("Computing acls") self.acls = self.compute_acls(self.checkpoint_file) - # FIXME: - # logging.info("Updating burn in") - # burnidx, is_burned_in = burn_in_eval.update(self, fp) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: - # write the current number of iterations - fp.attrs['niterations'] = self.niterations - # FIXME: - #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + if self.burn_in is not None: + fp.write_burn_in(self.burn_in) if self.acls is not None: fp.write_acls(acls) + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + fp.attrs['n_indep_samples'] = self.n_indep_samples # check validity checkpoint_valid = validate_checkpoint_files( self.checkpoint_file, self.backup_file) @@ -408,7 +434,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, ``nwalkers x niterations``. """ acfs = {} - with cls._io(filename, 'r') as fp: + with cls.io(filename, 'r') as fp: if parameters is None: parameters = fp.variable_params if isinstance(parameters, str) or isinstance(parameters, unicode): @@ -462,7 +488,7 @@ def compute_acls(cls, filename, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - with cls._io(filename, 'r') as fp: + with cls.io(filename, 'r') as fp: for param in fp.variable_params: samples = fp.read_raw_samples( fp, param, thin_start=start_index, thin_interval=1, diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 9da1bba..34a09f2 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -33,9 +33,11 @@ from pycbc.io import FieldArray from pycbc.filter import autocorrelation from pycbc.pool import choose_pool +from pycbc.workflow import ConfigParser from .base import BaseSampler from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) +from ../ import burn_in # @@ -63,6 +65,7 @@ class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): """ name = "emcee" _io = EmceeFile + burn_in_class = burn_in.MCMCBurnInTests def __init__(self, model, nwalkers, logpost_function=None, nprocesses=1, use_mpi=False): @@ -191,21 +194,10 @@ def write_results(self, filename): # write random state fp.write_random_state(state=self._sampler.random_state) - def finalize(self): - """Finalize the samples file.""" - # Compute/write final ACL - acls = self.compute_acls(self.checkpoint_file) - # FIXME: - # logging.info("Updating burn in") - # burnidx, is_burned_in = burn_in_eval.update(self, fp) - # write - with self.io(self.checkpoint_file, "a") as fp: - # write the current number of iterations - fp.attrs['niterations'] = self.niterations - # FIXME: - #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - fp.write_acls(acls) + """All data is written by the last checkpoint in the run method, so + this just passes.""" + pass @classmethod def from_config(cls, cp, model, nprocesses=1, use_mpi=False): @@ -220,6 +212,12 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): lnpost = cp.get(section, "logpost-function") else: lnpost = None - return cls(model, nwalkers, logpost_function=lnpost, + obj = cls(model, nwalkers, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) - + # add burn-in if it's specified + try: + bit = obj.burn_in_class.from_config(cp, obj) + except ConfigParser.NoSectionError: + bit = None + obj.set_burn_in(bit) + return obj From c0eb5c66a1edbf30b6b397c79f48cd00c1d0be91 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 
30 Jul 2018 17:43:06 +0200 Subject: [PATCH 28/47] add thin_start/interval/end to the hdf file attrs --- gwin/io/base_hdf.py | 57 ++++++++++++++++++++++++++++++++------- gwin/io/base_mcmc.py | 10 ++++--- gwin/sampler/base_mcmc.py | 1 - 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 06bbe34..007c3bc 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -213,6 +213,41 @@ def n_indep_samples(self): except KeyError: return 0 + @property + def thin_start(self): + """The default start index to use when reading samples. + + This tries to read from ``thin_start`` in the ``attrs``. If it isn't + there, just returns 0.""" + try: + return self.attrs['thin_start'] + except KeyError: + return 0 + + @property + def thin_interval(self): + """The default interval to use when reading samples. + + This tries to read from ``thin_interval`` in the ``attrs``. If it + isn't there, just returns 1. + """ + try: + return self.attrs['thin_interval'] + except KeyError: + return 1 + + @property + def thin_end(self): + """The defaut end index to use when reading samples. + + This tries to read from ``thin_end`` in the ``attrs``. If it isn't + there, just returns None. + """ + try: + return self.attrs['thin_end'] + except KeyError: + return None + @property def cmd(self): """Returns the (last) saved command line. @@ -557,7 +592,7 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, # check that we're not trying to overwrite this file if other == self.name: raise IOError("destination is the same as this file") - other = InferenceFile(other, 'w') + other = self.__class__(other, 'w') # metadata self.copy_metadata(other) # info @@ -572,15 +607,17 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, parameter_names=parameter_names, read_args=read_args, write_args=write_args) - # if any down selection was done, re-set the burn in iterations and - # the acl, and the niterations. - # The last dimension of the samples returned by the sampler should - # be the number of iterations. - #if samples.shape[-1] != self.niterations: - # other.attrs['acl'] = 1 - # other.attrs['burn_in_iterations'] = 0 - # other.attrs['niterations'] = samples.shape[-1] - #return other + # if any down selection was done, re-set the default + # thin-start/interval/end + p = self[self.samples_group].keys()[0] + my_shape = self[self.samples_group][p].shape + p = other[other.samples_group].keys()[0] + other_shape = other[other.samples_group][p].shape + if my_shape != other_shape: + other.attrs['thin_start'] = 0 + other.attrs['thin_interval'] = 1 + other.attrs['thin_end'] = None + return other def write_kwargs_to_hdf_attrs(attrs, **kwargs): diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index f068d12..651c67b 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -196,8 +196,10 @@ def write_acls(self, acls): # dataset doesn't exist yet self[group.format(param)] = acls[param] # write the maximum over all params - self.attrs['acl'] = numpy.array(acls.values()).max() - return self.attrs['acl'] + acl = numpy.array(acls.values()).max() + self.attrs['acl'] = acl + # set the default thin interval to be the acl + self.attrs['thin_interval'] = acl def read_acls(self): """Reads the acls of all the parameters. 
@@ -218,9 +220,11 @@ def read_acls(self): def write_burn_in(self, burn_in): """Write the given burn-in data to the given filename.""" group = self[self.sampler_group] + group.attrs['burn_in_test'] = burn_in.burn_in_test group.attrs['is_burned_in'] = burn_in.is_burned_in group.attrs['burn_in_iteration'] = burn_in.burn_in_iteration - group.attrs['burn_in_test'] = burn_in.burn_in_test + # set the defaut thin_start to be the burn_in_iteration + self.attrs['thin_start'] = burn_in.burn_in_iteration # write individual test data for tst in burn_in.burn_in_data: key = 'burn_in_tests/{}'.format(tst) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 120846c..954e40f 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -391,7 +391,6 @@ def compute_acl(cls, filename, **kwargs): pass - class EnsembleMCMCAutocorrSupport(object): """Provides class methods for calculating ensemble ACFs/ACLs. """ From eead8a805ebea4358a5432b6afd2e2b56ee37143 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:28:00 +0200 Subject: [PATCH 29/47] fix typos, whitespace in burn_in --- gwin/burn_in.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 6e83bb5..f7dca6a 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -30,10 +30,12 @@ from scipy.stats import ks_2samp from pycbc.filter import autocorrelation +from pycbc.io.record import get_vars_from_arg # The value to use for a burn-in iteration if a chain is not burned in NOT_BURNED_IN_ITER = -1 + # # ============================================================================= # @@ -41,6 +43,8 @@ # # ============================================================================= # + + def ks_test(samples1, samples2, threshold=0.9): """Applies a KS test to determine if two sets of samples are the same. @@ -145,7 +149,7 @@ def max_posterior(lnps_per_walker, dim): # find the first iteration in each chain where the logpost has exceeded # max_p - dim/2 for ii in range(nwalkers): - chain = lnps_per_walker[ii,:] + chain = lnps_per_walker[ii, :] passedidx = numpy.where(chain >= criteria)[0] is_burned_in[ii] = is_burned_in = passedidx.size > 0 if is_burned_in: @@ -191,7 +195,6 @@ def posterior_step(logposts, dim): # ============================================================================= # -from pycbc.io.record import get_vars_from_arg class MCMCBurnInTests(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" @@ -260,10 +263,10 @@ def ks_test(self, filename): # do the test # is_the_same is a dictionary of params --> bool indicating whether or # not the 1D marginal is the same at the half way point - is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) + is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) data = self.burn_in_data['ks_test'] # required things to store - data['is_burned_in'] = all(is_the_same.values()) + data['is_burned_in'] = all(is_the_same.values()) if data['is_burned_in']: data['burn_in_iteration'] = int(niters/2.) 
else: @@ -315,5 +318,5 @@ def from_config(cls, cp, sampler): kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag)) if cp.has_option_tag(section, 'ks-threshold', tag) kwargs['ks_threshold'] = float( - cp.get_opt_tag(section, 'ks-threshold', tag) + cp.get_opt_tag(section, 'ks-threshold', tag)) return cls(sampler, burn_in_test, **kwargs) From e765c129faae4f431f581ec8341c4ce490d220ef Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:33:42 +0200 Subject: [PATCH 30/47] fix whitespace, typos in base_hdf --- gwin/io/base_hdf.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 007c3bc..e3d9a00 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -41,9 +41,10 @@ from .. import sampler as gwin_sampler + class BaseInferenceFile(h5py.File): """Base class for all inference hdf files. - + This is a subclass of the h5py.File object. It adds functions for handling reading and writing the samples from the samplers. @@ -67,7 +68,7 @@ def __init__(self, path, mode=None, **kwargs): def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. - + Note that properties will be called before this, so if there are any properties that share the same name as something in ``.attrs``, that property will get returned. @@ -162,7 +163,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): # convert to FieldArray samples = array_class.from_kwargs(**samples) # add the static params - for p,val in self.static_params.items(): + for (p, val) in self.static_params.items(): setattr(samples, p, val) return samples @@ -612,7 +613,7 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, p = self[self.samples_group].keys()[0] my_shape = self[self.samples_group][p].shape p = other[other.samples_group].keys()[0] - other_shape = other[other.samples_group][p].shape + other_shape = other[other.samples_group][p].shape if my_shape != other_shape: other.attrs['thin_start'] = 0 other.attrs['thin_interval'] = 1 @@ -622,7 +623,7 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, def write_kwargs_to_hdf_attrs(attrs, **kwargs): """Writes the given keywords to the given ``attrs``. - + If any keyword argument points to a dict, the keyword will point to a list of the dict's keys. Each key is then written to the attrs with its corresponding value. From ab40ad04168e4dbb93c278dfc873bbfb4fc51647 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:34:03 +0200 Subject: [PATCH 31/47] rename EnsembleMCMCIO to MCMCIO; fix whitespace --- gwin/io/base_mcmc.py | 7 ++++--- gwin/io/emcee.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 651c67b..ef834dc 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -40,7 +40,8 @@ from .hdf import InferenceFile -class EnsembleMCMCIO(obect): + +class MCMCIO(obect): """Abstract base class that provides some IO functions for ensemble MCMCs. """ __metaclass__ = ABCMeta @@ -113,8 +114,8 @@ def write_samples(self, samples, parameters=None, fp[dataset_name][:, istart:istop] = samples[param] def read_raw_samples(self, fields, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): """Base function for reading samples. 
Parameters diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index 2376c64..e2ad663 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -27,7 +27,8 @@ from .base_hdf import BaseInferenceFile from .base_mcmc import EnsembleMCMCIO -class EmceeFile(EnsembleMCMCIO, BaseInferenceFile): + +class EmceeFile(MCMCIO, BaseInferenceFile): """Class to handle file IO for the ``emcee`` sampler.""" name = 'emcee_file' @@ -69,5 +70,3 @@ def write_acceptance_fraction(self, acceptance_fraction): except KeyError: # dataset doesn't exist yet, create it self[group] = acceptance_fraction - - From ac6d5148d022ffe0e5f103f37fcf672b85a152ea Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:37:49 +0200 Subject: [PATCH 32/47] fix typo --- gwin/burn_in.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index f7dca6a..afd91a2 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -316,7 +316,7 @@ def from_config(cls, cp, sampler): kwargs = {} if cp.has_option_tag(section, 'nacl', tag): kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag)) - if cp.has_option_tag(section, 'ks-threshold', tag) + if cp.has_option_tag(section, 'ks-threshold', tag): kwargs['ks_threshold'] = float( cp.get_opt_tag(section, 'ks-threshold', tag)) return cls(sampler, burn_in_test, **kwargs) From 23366e3eb062f1b357ccd23003a6b88a44c78f13 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:45:49 +0200 Subject: [PATCH 33/47] fix whitespace --- gwin/models/gaussian_noise.py | 2 +- gwin/sampler/base.py | 25 +++++++++++++------------ gwin/sampler/base_mcmc.py | 10 ++++++---- gwin/sampler/emcee.py | 8 ++++---- gwin/sampler/emcee_pt.py | 4 +++- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index c04dd4c..645dbb1 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -439,7 +439,7 @@ def det_optimal_snrsq(self, det): def write_metadata(self, fp): """Adds writing the psds and lognl, since it's a constant. - + The lognl is written to the sample group's ``attrs``. """ super(GaussianNoise, self).write_data(fp) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index e0cb543..f0f3b48 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -55,7 +55,7 @@ class BaseSampler(object): def __init__(self, model): self.model = model - #@classmethod # uncomment when we move to python 3.3 + # @classmethod <--uncomment when we move to python 3.3 @abstractmethod def from_config(cls, cp, model, nprocesses=1, use_mpi=False, **kwargs): @@ -85,7 +85,7 @@ def static_params(self): def samples(self): """A dict mapping variable_params to arrays of samples currently in memory. The dictionary may also contain sampling_params. - + The sample arrays may have any shape, and may or may not be thinned. """ pass @@ -102,7 +102,7 @@ def model_stats(self): @abstractmethod def run(self): """This function should run the sampler. - + Any checkpointing should be done internally in this function. """ pass @@ -111,7 +111,7 @@ def run(self): def io(self): """A class that inherits from ``BaseInferenceFile`` to handle IO with an hdf file. - + This should be a class, not an instance of class, so that the sampler can initialize it when needed. """ @@ -121,7 +121,7 @@ def io(self): def set_initial_conditions(self, initial_distribution=None, samples_file=None): """Sets up the starting point for the sampler. - + Should also set the sampler's random state. 
""" pass @@ -144,7 +144,7 @@ def write_metadata(self, fp): # write the model's metadata self.model.write_metadata(fp) self._write_more_metadata(fp) - + def _write_more_metadata(self, fp): """Optional method that can be implemented if a sampler wants to write more metadata than just its name and the model's metadata. @@ -160,7 +160,7 @@ def setup_output(self, output_file, force=False, injection_file=None): If the output file already exists, an ``OSError`` will be raised. This can be overridden by setting ``force`` to ``True``. - + Parameters ---------- sampler : sampler instance @@ -231,7 +231,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, If the output file already exists, an ``OSError`` will be raised. This can be overridden by setting ``force`` to ``True``. - + Parameters ---------- sampler : sampler instance @@ -285,10 +285,11 @@ def intial_dist_from_config(cp): "than the prior.") initial_dists = distributions.read_distributions_from_config( cp, section="initial") - constraints = distributions.read_constraints_from_config(cp, - constraint_section="initial_constraint") - init_dist = distributions.JointDistribution(sampler.variable_params, - *initial_dists, **{"constraints" : constraints}) + constraints = distributions.read_constraints_from_config( + cp, constraint_section="initial_constraint") + init_dist = distributions.JointDistribution( + sampler.variable_params, *initial_dists, + **{"constraints": constraints}) else: init_dist = None return init_dist diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 954e40f..71b6f8b 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -34,6 +34,8 @@ # # ============================================================================= # + + def raw_samples_to_dict(sampler, raw_samples): """Convenience function for converting ND array to a dict of samples. @@ -131,11 +133,11 @@ class BaseMCMC(object): _nwalkers = None _burn_in = None - @abstractproperty(self): + @abstractproperty def base_shape(self): - """What shape the sampler's samples arrays are in, excluding + """What shape the sampler's samples arrays are in, excluding the iterations dimension. - + For example, if a sampler uses 20 walkers and 3 temperatures, this would be ``(3, 20)``. If a sampler only uses a single walker and no temperatures this would be ``()``. @@ -173,7 +175,7 @@ def pos(self): @property def p0(self): """The starting position of the walkers in the sampling param space. - + The returned object is a dict mapping the sampling parameters to the values. """ diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 34a09f2..a16cde9 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -37,7 +37,7 @@ from .base import BaseSampler from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) -from ../ import burn_in +from gwin import burn_in # @@ -111,7 +111,7 @@ def base_shape(self): def samples(self): """A dict mapping ``variable_params`` to arrays of samples currently in memory. - + The arrays have shape ``nwalkers x niterations``. """ # emcee stores samples to it's chain attribute as a @@ -122,7 +122,7 @@ def samples(self): @property def model_stats(self): """A dict mapping the model's ``default_stats`` to arrays of values. - + The returned array has shape ``nwalkers x niterations``. 
""" return raw_samples_to_dict(self._sampler.blobs, raw_stats) @@ -213,7 +213,7 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): else: lnpost = None obj = cls(model, nwalkers, logpost_function=lnpost, - nprocesses=nprocesses, use_mpi=use_mpi) + nprocesses=nprocesses, use_mpi=use_mpi) # add burn-in if it's specified try: bit = obj.burn_in_class.from_config(cp, obj) diff --git a/gwin/sampler/emcee_pt.py b/gwin/sampler/emcee_pt.py index 8cb6605..cef83fd 100644 --- a/gwin/sampler/emcee_pt.py +++ b/gwin/sampler/emcee_pt.py @@ -26,9 +26,11 @@ packages for parameter estimation. """ -# This is needed for two reason +# The following two classes are needed for two reason # 1) pools freeze state when created and so classes *cannot be updated* # 2) methods cannot be pickled. + + class _callprior(object): """Calls the model's prior function, and ensures that no metadata is returned.""" From 60d0e75ca2ddf1edc8b44f30cdf889a6df9aa347 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 11:48:05 +0200 Subject: [PATCH 34/47] write filetype to inference hdf files; provide a loadfile function --- gwin/io/__init__.py | 44 +++++++++++++++++++++++++++++++++- gwin/io/base_hdf.py | 58 ++++++++++++++++++++++----------------------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index 2e19621..8b78ce3 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -17,5 +17,47 @@ """I/O utilities for GWIn """ -from .hdf import InferenceFile +import h5py as _h5py +from .emcee import EmceeFile from .txt import InferenceTXTFile + +filetypes = { + EmceeFile.name: EmceeFile, +} + + +def loadfile(path, mode=None, filetype=None, **kwargs): + """Loads the given file using the appropriate InferenceFile class. + + If ``filetype`` is not provided, this will try to retreive the ``filetype`` + from the file's ``attrs``. If the file does not exist yet, an IOError will + be raised if ``filetype`` is not provided. + + Parameters + ---------- + path : str + The filename to load. + mode : str, optional + What mode to load the file with, e.g., 'w' for write, 'r' for read, + 'a' for append. Default will default to h5py.File's mode, which is 'a'. + filetype : str, optional + Force the file to be loaded with the given class name. This must be + provided if creating a new file. + + Returns + ------- + filetype instance + An open file handler to the file. The class used for IO with the file + is determined by the ``filetype`` keyword (if provided) or the + ``filetype`` stored in the file (if not provided). + """ + if filetype is None: + # try to read the file to get its filetype + try: + with _h5py.File(path, 'r') as fp: + filetype = fp.attrs['filetype'] + except IOError: + # file doesn't exist, filetype must be provided + raise IOError("The file appears not to exist. 
In this case, " + "filetype must be provided.") + return filetypes[filetype](path, mode=mode, **kwargs) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index e3d9a00..75d6c73 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -64,7 +64,23 @@ class BaseInferenceFile(h5py.File): injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): - super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + fp = super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + # check that file type matches self + try: + filetype = fp.attrs['filetype'] + except KeyError: + if mode == 'w': + # first time creating the file, add this class's name + filetype = self.name + fp.attrs['filetype'] = filetype + else: + filetype = None + if filetype != self.name: + raise ValueError("This file has filetype {}, whereas this class " + "is named {}. This indicates that the file was " + "not written by this class, and so cannot be " + "read by this class.".format(filetype, self.name)) + return fp def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. @@ -444,42 +460,26 @@ def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): Parameters ---------- - thin_start : {None, int} - The starting index to use. If None, will try to retrieve the - `burn_in_iterations` from the given file. If no - `burn_in_iterations` exists, will default to the start of the - array. - thin_interval : {None, int} - The interval to use. If None, will try to retrieve the acl from the - given file. If no acl attribute exists, will default to 1. - thin_end : {None, int} - The end index to use. If None, will retrieve to the end of the - array. + thin_start : int, optional + The starting index to use. If None, will use the ``thin_start`` + attribute. + thin_interval : int, optional + The interval to use. If None, will use the ``thin_interval`` + attribute. + thin_end : int, optional + The end index to use. If None, will use the ``thin_end`` attribute. Returns ------- slice : The slice needed. """ - - # default is to skip burn in samples if thin_start is None: - try: - thin_start = self.burn_in_iterations - # if the sampler hasn't burned in, the burn_in_iterations will - # be the same as the number of iterations, which would result - # in 0 samples. 
In that case, just use the last one - if thin_start == self.niterations: - thin_start = thin_start - 1 - except KeyError: - pass - - # default is to use stored ACL and accept every i-th sample + thin_start = self.thin_start if thin_interval is None: - try: - thin_interval = int(numpy.ceil(self.acl)) - except KeyError: - pass + thin_interval = self.thin_interval + if thin_end is None: + thin_end = self.thin_end return slice(thin_start, thin_end, thin_interval) def copy_metadata(self, other): From 704d417c21e15c5ce01ebc0d75314d4492b5283e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 11:48:49 +0200 Subject: [PATCH 35/47] fix some import errors --- gwin/sampler/base_mcmc.py | 4 ++-- gwin/sampler/emcee.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 71b6f8b..4dc5c6d 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -23,7 +23,7 @@ # """Provides constructor classes and convenience functions for MCMC samplers.""" -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import (ABCMeta, abstractmethod, abstractproperty) import logging import numpy @@ -393,7 +393,7 @@ def compute_acl(cls, filename, **kwargs): pass -class EnsembleMCMCAutocorrSupport(object): +class MCMCAutocorrSupport(object): """Provides class methods for calculating ensemble ACFs/ACLs. """ diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index a16cde9..d935cb3 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -30,14 +30,13 @@ import numpy import emcee -from pycbc.io import FieldArray -from pycbc.filter import autocorrelation from pycbc.pool import choose_pool from pycbc.workflow import ConfigParser from .base import BaseSampler -from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) -from gwin import burn_in +from .base_mcmc import (BaseMCMC, MCMCAutocorrSupport, raw_samples_to_dict, + raw_stats_to_dict) +from gwin.burn_in import MCMCBurnInTests # @@ -48,7 +47,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): +class EmceeEnsembleSampler(MCMCAutocorrSupport, BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. @@ -65,7 +64,7 @@ class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): """ name = "emcee" _io = EmceeFile - burn_in_class = burn_in.MCMCBurnInTests + burn_in_class = MCMCBurnInTests def __init__(self, model, nwalkers, logpost_function=None, nprocesses=1, use_mpi=False): From adee9c3e6d13c0d0ab6d2026f785a9f9cd269abb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 13:46:42 +0200 Subject: [PATCH 36/47] remove sampler_class from io to avoid circular imports --- gwin/io/base_hdf.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 75d6c73..fa3f86c 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -25,6 +25,8 @@ inference samplers generate. """ +from __future__ import absolute_import + import os import sys import logging @@ -39,8 +41,6 @@ from pycbc.types import FrequencySeries from pycbc.waveform import parameters as wfparams -from .. import sampler as gwin_sampler - class BaseInferenceFile(h5py.File): """Base class for all inference hdf files. 
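For orientation on why ``sampler_class`` is being removed in this patch: after PATCH 34, file handling is dispatched purely on the ``filetype`` attribute via ``loadfile``, so a results file can be read back without importing ``gwin.sampler`` at all. The sketch below is illustrative only; the file name and parameter names are hypothetical placeholders, and only ``loadfile``, ``read_samples``, and the ``filetype`` attribute come from the diffs.

from gwin.io import loadfile

# 'results.hdf' stands in for an already-written results file
with loadfile('results.hdf', 'r') as fp:
    filetype = fp.attrs['filetype']   # e.g. 'emcee_file'
    # the parameter names here are placeholders for whatever was sampled
    samples = fp.read_samples(['mass1', 'mass2'])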
@@ -205,15 +205,6 @@ def write_posterior(self, posterior_file, **kwargs): """ pass - @property - def sampler_class(self): - """Returns the sampler class that was used.""" - try: - sampler = self.sampler_name - except KeyError: - return None - return gwin_sampler.samplers[sampler] - @property def static_params(self): """Returns a dictionary of the static_params. The keys are the argument From 36a5e75b66e580f3dff4129342b3ee24b78295c2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 22:19:54 +0200 Subject: [PATCH 37/47] fix bugs --- bin/gwin | 38 +++++----- gwin/io/__init__.py | 144 +++++++++++++++++++++++++++++++++++++- gwin/io/base_hdf.py | 58 ++------------- gwin/io/base_mcmc.py | 44 +++++------- gwin/io/emcee.py | 5 +- gwin/models/base.py | 2 +- gwin/option_utils.py | 82 ---------------------- gwin/sampler/__init__.py | 40 +++++++++-- gwin/sampler/base.py | 16 +++-- gwin/sampler/base_mcmc.py | 80 ++++++++++++--------- gwin/sampler/emcee.py | 39 ++++++----- 11 files changed, 302 insertions(+), 246 deletions(-) diff --git a/bin/gwin b/bin/gwin index 9bf822c..26ce45b 100644 --- a/bin/gwin +++ b/bin/gwin @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (C) 2016 Christopher M. Biwer +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -32,8 +32,6 @@ from pycbc.waveform import generator import gwin from gwin import (__version__, burn_in, option_utils) -from gwin.io.hdf import InferenceFile -from gwin.option_utils import validate_checkpoint_files from gwin.calibration import Recalibrate # command line usage @@ -53,6 +51,12 @@ parser.add_argument("--save-backup", action="store_true", default=False, help="Don't delete the backup file after the run has " "completed.") +# parallelization options +parser.add_argument("--nprocesses", type=int, default=1, + help="Number of processes to use. If not given then only " + "a single core will be used.") +parser.add_argument("--use-mpi", action='store_true', default=False, + help="Use MPI to parallelize the sampler") # run duration options parser.add_argument("--nsamples", type=int, required=True, help="The number of samples the sampler should get. " @@ -143,8 +147,9 @@ with ctx: # get ifo-specific instances of calibration model if cp.has_section('calibration'): logging.info("Initializing calibration model") - recalibration = {ifo: Recalibrate.from_config(cp, ifo, section='calibration') for - ifo in opts.instruments} + recalibration = {ifo: Recalibrate.from_config(cp, ifo, + section='calibration') + for ifo in opts.instruments} model_args['recalibration'] = recalibration # get gates for templates @@ -157,10 +162,6 @@ with ctx: # construct class that will return the natural logarithm of likelihood model = gwin.models.read_from_config(cp, **model_args) - # FIXME: move to MCMC sampler - #burn_in_eval = burn_in.BurnIn(opts.burn_in_function, - # min_iterations=opts.min_burn_in) - logging.info("Setting up sampler") # Create sampler that will run. @@ -176,17 +177,18 @@ with ctx: # injection file all detectors. This # should be fixed in a future version of PyCBC. Once it is, # update this. Until then, just use the first file. 
- injection_file = opts.injection_file.values()[0] # None if not set + if opts.injection_file: + injection_file = opts.injection_file.values()[0] # None if not set + else: + injection_file = None sampler.setup_output(opts.output_file, force=opts.force, - injection_file=injetion_file) + injection_file=injection_file) - # set the walkers initial positions from a pre-existing InferenceFile - # or a specific initial distribution listed in the configuration file - # or else use the prior distributions to set initial positions - logging.info("Setting walkers initial conditions for varying parameters") + # Figure out where to get the initial conditions from: a samples file, + # the checkpoint file, the prior, or an initial prior. samples_file = opts.samples_file # use the checkpoint file instead if resume from checkpoint - if sampler.checkpoint_valid: + if not sampler.new_checkpoint: samples_file = sampler.checkpoint_file if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", @@ -194,9 +196,9 @@ with ctx: init_prior = None else: # try to load an initial distribution from the config file - init_prior = gwin.sampler.inital_dist_from_config(cp) + init_prior = gwin.sampler.initial_dist_from_config(cp) - sampler.set_initial_conditions(intial_distribution=init_prior, + sampler.set_initial_conditions(initial_distribution=init_prior, samples_file=samples_file) # Set the target number of samples for the sampler diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index 8b78ce3..ea519eb 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -17,7 +17,13 @@ """I/O utilities for GWIn """ +from __future__ import absolute_import + +import os +import shutil +import logging import h5py as _h5py + from .emcee import EmceeFile from .txt import InferenceTXTFile @@ -25,7 +31,6 @@ EmceeFile.name: EmceeFile, } - def loadfile(path, mode=None, filetype=None, **kwargs): """Loads the given file using the appropriate InferenceFile class. @@ -61,3 +66,140 @@ def loadfile(path, mode=None, filetype=None, **kwargs): raise IOError("The file appears not to exist. In this case, " "filetype must be provided.") return filetypes[filetype](path, mode=mode, **kwargs) + +# +# ============================================================================= +# +# HDF Utilities +# +# ============================================================================= +# + + +def check_integrity(filename): + """Checks the integrity of an InferenceFile. + + Checks done are: + + * can the file open? + * do all of the datasets in the samples group have the same shape? + * can the first and last sample in all of the datasets in the samples + group be read? + + If any of these checks fail, an IOError is raised. + + Parameters + ---------- + filename: str + Name of an InferenceFile to check. + + Raises + ------ + ValueError + If the given file does not exist. + KeyError + If the samples group does not exist. + IOError + If any of the checks fail. 
+ """ + # check that the file exists + if not os.path.exists(filename): + raise ValueError("file {} does not exist".format(filename)) + # if the file is corrupted such that it cannot be opened, the next line + # will raise an IOError + with loadfile(filename, 'r') as fp: + # check that all datasets in samples have the same shape + parameters = fp[fp.samples_group].keys() + group = fp.samples_group + '/{}' + # use the first parameter as a reference shape + ref_shape = fp[group.format(parameters[0])].shape + if not all(fp[group.format(param)].shape == ref_shape + for param in parameters): + raise IOError("not all datasets in the samples group have the " + "same shape") + # check that we can read the first/last sample + firstidx = tuple([0]*len(ref_shape)) + lastidx = tuple([-1]*len(ref_shape)) + for param in parameters: + fp[group.format(param)][firstidx] + fp[group.format(param)][lastidx] + + +def validate_checkpoint_files(checkpoint_file, backup_file): + """Checks if the given checkpoint and/or backup files are valid. + + The checkpoint file is considered valid if: + + * it passes all tests run by ``check_integrity``; + * it has at least one sample written to it (indicating at least one + checkpoint has happened). + + The same applies to the backup file. The backup file must also have the + same number of samples as the checkpoint file, otherwise, the backup is + considered invalid. + + If the checkpoint (backup) file is found to be valid, but the backup + (checkpoint) file is not valid, then the checkpoint (backup) is copied to + the backup (checkpoint). Thus, this function ensures that checkpoint and + backup files are either both valid or both invalid. + + Parameters + ---------- + checkpoint_file : string + Name of the checkpoint file. + backup_file : string + Name of the backup file. + + Returns + ------- + checkpoint_valid : bool + Whether or not the checkpoint (and backup) file may be used for loading + samples. 
+ """ + # check if checkpoint file exists and is valid + try: + check_integrity(checkpoint_file) + checkpoint_valid = True + except (ValueError, KeyError, IOError): + checkpoint_valid = False + # backup file + try: + check_integrity(backup_file) + backup_valid = True + except (ValueError, KeyError, IOError): + backup_valid = False + # check if there are any samples in the file; if not, we'll just start from + # scratch + if checkpoint_valid: + with loadfile(checkpoint_file, 'r') as fp: + try: + group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) + nsamples = fp[group].size + checkpoint_valid = nsamples != 0 + except KeyError: + checkpoint_valid = False + # check if there are any samples in the backup file + if backup_valid: + with loadfile(backup_file, 'r') as fp: + try: + group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) + backup_nsamples = fp[group].size + backup_valid = backup_nsamples != 0 + except KeyError: + backup_valid = False + # check that the checkpoint and backup have the same number of samples; + # if not, assume the checkpoint has the correct number + if checkpoint_valid and backup_valid: + backup_valid = nsamples == backup_nsamples + # decide what to do based on the files' statuses + if checkpoint_valid and not backup_valid: + # copy the checkpoint to the backup + logging.info("Backup invalid; copying checkpoint file") + shutil.copy(checkpoint_file, backup_file) + backup_valid = True + elif backup_valid and not checkpoint_valid: + logging.info("Checkpoint invalid; copying backup file") + # copy the backup to the checkpoint + shutil.copy(backup_file, checkpoint_file) + checkpoint_valid = True + return checkpoint_valid diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index fa3f86c..8355ecb 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -64,15 +64,15 @@ class BaseInferenceFile(h5py.File): injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): - fp = super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + super(BaseInferenceFile, self).__init__(path, mode, **kwargs) # check that file type matches self try: - filetype = fp.attrs['filetype'] + filetype = self.attrs['filetype'] except KeyError: if mode == 'w': # first time creating the file, add this class's name filetype = self.name - fp.attrs['filetype'] = filetype + self.attrs['filetype'] = filetype else: filetype = None if filetype != self.name: @@ -80,7 +80,6 @@ def __init__(self, path, mode=None, **kwargs): "is named {}. This indicates that the file was " "not written by this class, and so cannot be " "read by this class.".format(filetype, self.name)) - return fp def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. @@ -314,7 +313,7 @@ def write_random_state(self, group=None, state=None): if state is None: state = numpy.random.get_state() s, arr, pos, has_gauss, cached_gauss = state - if group in self: + if dataset_name in self: self[dataset_name][:] = arr else: self.create_dataset(dataset_name, arr.shape, fletcher32=True, @@ -635,52 +634,3 @@ def write_kwargs_to_hdf_attrs(attrs, **kwargs): write_kwargs_to_hdf_attrs(attrs, **val) else: attrs[arg] = val - - -def check_integrity(filename): - """Checks the integrity of an InferenceFile. - - Checks done are: - - * can the file open? - * do all of the datasets in the samples group have the same shape? - * can the first and last sample in all of the datasets in the samples - group be read? - - If any of these checks fail, an IOError is raised. 
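A minimal usage sketch of these two helpers (paths are illustrative):

    from gwin.io import validate_checkpoint_files

    checkpoint_file = 'run.hdf.checkpoint'
    backup_file = 'run.hdf.bkup'
    # True if at least one of the two files passes check_integrity and holds
    # samples; the invalid file, if any, is overwritten by a copy of the valid one
    can_resume = validate_checkpoint_files(checkpoint_file, backup_file)
    if not can_resume:
        print("no usable checkpoint; a new output file will be created")
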
- - Parameters - ---------- - filename: str - Name of an InferenceFile to check. - - Raises - ------ - ValueError - If the given file does not exist. - KeyError - If the samples group does not exist. - IOError - If any of the checks fail. - """ - # check that the file exists - if not os.path.exists(filename): - raise ValueError("file {} does not exist".format(filename)) - # if the file is corrupted such that it cannot be opened, the next line - # will raise an IOError - with InferenceFile(filename, 'r') as fp: - # check that all datasets in samples have the same shape - parameters = fp[fp.samples_group].keys() - group = fp.samples_group + '/{}' - # use the first parameter as a reference shape - ref_shape = fp[group.format(parameters[0])].shape - if not all(fp[group.format(param)].shape == ref_shape - for param in parameters): - raise IOError("not all datasets in the samples group have the " - "same shape") - # check that we can read the first/last sample - firstidx = tuple([0]*len(ref_shape)) - lastidx = tuple([-1]*len(ref_shape)) - for param in parameters: - fp[group.format(param)][firstidx] - fp[group.format(param)][lastidx] diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index ef834dc..d2087d4 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -24,24 +24,12 @@ """Provides I/O that is specific to MCMC samplers. """ -import os -import sys -import logging -from abc import ABCMeta +from abc import (ABCMeta, abstractmethod) import numpy -import h5py -from pycbc import DYN_RANGE_FAC -from pycbc.io import FieldArray -from pycbc.types import FrequencySeries -from pycbc.waveform import parameters as wfparams - -from .hdf import InferenceFile - - -class MCMCIO(obect): +class MCMCIO(object): """Abstract base class that provides some IO functions for ensemble MCMCs. """ __metaclass__ = ABCMeta @@ -82,25 +70,27 @@ def write_samples(self, samples, parameters=None, h5py. 
""" nwalkers, niterations = samples.values()[0].shape - assert(all(p.shape == (nwalkers, niterations) - for p in samples.values()), + assert all(p.shape == (nwalkers, niterations) + for p in samples.values()), ( "all samples must have the same shape") if max_iterations is not None and max_iterations < niterations: raise IndexError("The provided max size is less than the " "number of iterations") group = self.samples_group + '/{name}' + if parameters is None: + parameters = samples.keys() # loop over number of dimensions for param in parameters: dataset_name = group.format(name=param) istart = start_iteration try: - fp_niterations = fp[dataset_name].shape[-1] + fp_niterations = self[dataset_name].shape[-1] if istart is None: istart = fp_niterations istop = istart + niterations if istop > fp_niterations: # resize the dataset - fp[dataset_name].resize(istop, axis=1) + self[dataset_name].resize(istop, axis=1) except KeyError: # dataset doesn't exist yet if istart is not None and istart != 0: @@ -108,10 +98,10 @@ def write_samples(self, samples, parameters=None, "but dataset doesn't exist yet") istart = 0 istop = istart + niterations - fp.create_dataset(dataset_name, (nwalkers, istop), - maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, istart:istop] = samples[param] + self.create_dataset(dataset_name, (nwalkers, istop), + maxshape=(nwalkers, max_iterations), + dtype=float, fletcher32=True) + self[dataset_name][:, istart:istop] = samples[param] def read_raw_samples(self, fields, thin_start=None, thin_interval=None, thin_end=None, @@ -139,16 +129,14 @@ def read_raw_samples(self, fields, if iteration is not None: get_index = iteration else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) + get_index = self.get_slice(thin_start=thin_start, + thin_end=thin_end, + thin_interval=thin_interval) # load group = self.samples_group + '/{name}' arrays = {} for name in fields: - arr = fp[group.format(name=name)][widx, get_index] + arr = self[group.format(name=name)][widx, get_index] if flatten: arr = arr.flatten() arrays[name] = arr diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index e2ad663..8331226 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -25,7 +25,7 @@ """ from .base_hdf import BaseInferenceFile -from .base_mcmc import EnsembleMCMCIO +from .base_mcmc import MCMCIO class EmceeFile(MCMCIO, BaseInferenceFile): @@ -70,3 +70,6 @@ def write_acceptance_fraction(self, acceptance_fraction): except KeyError: # dataset doesn't exist yet, create it self[group] = acceptance_fraction + + def write_posterior(self, filename, **kwargs): + pass diff --git a/gwin/models/base.py b/gwin/models/base.py index d5a3d5e..9c4598c 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -746,7 +746,7 @@ def from_config(cls, cp, **kwargs): def write_metadata(self, fp): """Writes metadata to the given file handler.""" - fp.attrs['model'] = sampler.model.name + fp.attrs['model'] = self.name fp.attrs['variable_params'] = list(self.variable_params) fp.attrs['sampling_params'] = list(self.sampling_params) write_kwargs_to_hdf_attrs(fp.attrs, static_params=self.static_params) diff --git a/gwin/option_utils.py b/gwin/option_utils.py index 5fe539e..47ff79c 100644 --- a/gwin/option_utils.py +++ b/gwin/option_utils.py @@ -191,86 +191,6 @@ def sampler_from_cli(opts, model, pool=None): # # 
----------------------------------------------------------------------------- -def validate_checkpoint_files(checkpoint_file, backup_file): - """Checks if the given checkpoint and/or backup files are valid. - - The checkpoint file is considered valid if: - - * it passes all tests run by ``InferenceFile.check_integrity``; - * it has at least one sample written to it (indicating at least one - checkpoint has happened). - - The same applies to the backup file. The backup file must also have the - same number of samples as the checkpoint file, otherwise, the backup is - considered invalid. - - If the checkpoint (backup) file is found to be valid, but the backup - (checkpoint) file is not valid, then the checkpoint (backup) is copied to - the backup (checkpoint). Thus, this function ensures that checkpoint and - backup files are either both valid or both invalid. - - Parameters - ---------- - checkpoint_file : string - Name of the checkpoint file. - backup_file : string - Name of the backup file. - - Returns - ------- - checkpoint_valid : bool - Whether or not the checkpoint (and backup) file may be used for loading - samples. - """ - # check if checkpoint file exists and is valid - logging.info("Validating checkpoint and backup files") - try: - check_integrity(checkpoint_file) - checkpoint_valid = True - except (ValueError, KeyError, IOError): - checkpoint_valid = False - # backup file - try: - check_integrity(backup_file) - backup_valid = True - except (ValueError, KeyError, IOError): - backup_valid = False - # check if there are any samples in the file; if not, we'll just start from - # scratch - if checkpoint_valid: - with InferenceFile(checkpoint_file, 'r') as fp: - try: - group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) - nsamples = fp[group].size - checkpoint_valid = nsamples != 0 - except KeyError: - checkpoint_valid = False - # check if there are any samples in the backup file - if backup_valid: - with InferenceFile(backup_file, 'r') as fp: - try: - group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) - backup_nsamples = fp[group].size - backup_valid = backup_nsamples != 0 - except KeyError: - backup_valid = False - # check that the checkpoint and backup have the same number of samples; - # if not, assume the checkpoint has the correct number - if checkpoint_valid and backup_valid: - backup_valid = nsamples == backup_nsamples - # decide what to do based on the files' statuses - if checkpoint_valid and not backup_valid: - # copy the checkpoint to the backup - logging.info("Backup invalid; copying checkpoint file") - shutil.copy(checkpoint_file, backup_file) - backup_valid = True - elif backup_valid and not checkpoint_valid: - logging.info("Checkpoint invalid; copying backup file") - # copy the backup to the checkpoint - shutil.copy(backup_file, checkpoint_file) - checkpoint_valid = True - return checkpoint_valid - def add_low_frequency_cutoff_opt(parser): """Adds the low-frequency-cutoff option to the given parser.""" @@ -325,7 +245,6 @@ def data_from_cli(opts): precision="double") # apply gates if not waiting to overwhiten if not opts.gate_overwhitened: - logging.info("Applying gates to strain data") strain_dict = apply_gates_to_td(strain_dict, gates) # get strain time series to use for PSD estimation @@ -350,7 +269,6 @@ def data_from_cli(opts): # FFT strain and save each of the length of the FFT, delta_f, and # low frequency cutoff to a dict - logging.info("FFT strain") stilde_dict = {} length_dict = {} delta_f_dict = {} diff --git a/gwin/sampler/__init__.py 
b/gwin/sampler/__init__.py index 6154aee..5b6e435 100644 --- a/gwin/sampler/__init__.py +++ b/gwin/sampler/__init__.py @@ -17,14 +17,42 @@ This modules provides a list of implemented samplers for parameter estimation. """ -from .kombine import KombineSampler -from .emcee import (EmceeEnsembleSampler, EmceePTSampler) -from .mcmc import MCMCSampler +from __future__ import absolute_import + +from .base import (initial_dist_from_config, create_new_output_file) +# from .kombine import KombineSampler +from .emcee import (EmceeEnsembleSampler, ) # EmceePTSampler) +# from .mcmc import MCMCSampler # list of available samplers samplers = {cls.name: cls for cls in ( - KombineSampler, + #KombineSampler, EmceeEnsembleSampler, - EmceePTSampler, - MCMCSampler, + #EmceePTSampler, + #MCMCSampler, )} + + +def load_from_config(cp, model, **kwargs): + """Loads a sampler from the given config file. + + This looks for a name in the section ``[sampler]`` to determine which + sampler class to load. That sampler's ``from_config`` is then called. + + Parameters + ---------- + cp : WorkflowConfigParser + Config parser to read from. + model : gwin.model + Which model to pass to the sampler. + \**kwargs : + All other keyword arguments are passed directly to the sampler's + ``from_config`` file. + + Returns + ------- + sampler : + The initialized sampler. + """ + name = cp.get('sampler', 'name') + return samplers[name].from_config(cp, model, **kwargs) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index f0f3b48..af041fa 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -26,12 +26,15 @@ """ from abc import ABCMeta, abstractmethod, abstractproperty +import os import numpy +import shutil from pycbc.io import FieldArray from pycbc.filter import autocorrelation import h5py import logging +from ..io import validate_checkpoint_files # # ============================================================================= @@ -176,20 +179,23 @@ def setup_output(self, output_file, force=False, injection_file=None): checkpoint_file = output_file + '.checkpoint' backup_file = output_file + '.bkup' # check if we have a good checkpoint and/or backup file + logging.info("Looking for checkpoint file") checkpoint_valid = validate_checkpoint_files(checkpoint_file, backup_file) # Create a new file if the checkpoint doesn't exist, or if it is # corrupted + self.new_checkpoint = False # keeps track if this is a new file or not if not checkpoint_valid: - self.create_new_output_file(checkpoint_file, force=force, - injection_file=injection_file) + logging.info("Checkpoint not found or not valid") + create_new_output_file(self, checkpoint_file, force=force, + injection_file=injection_file) # now the checkpoint is valid - checkpoint_valid = True + self.new_checkpoint = True # copy to backup shutil.copy(checkpoint_file, backup_file) # write the command line for fn in [checkpoint_file, backup_file]: - with sampler.io(fn, "a") as fp: + with self.io(fn, "a") as fp: fp.write_command_line() # store self.checkpoint_file = checkpoint_file @@ -263,7 +269,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, fp.write_injections(injection_file) -def intial_dist_from_config(cp): +def initial_dist_from_config(cp): """Loads a distribution for the sampler start from the given config file. 
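A sketch of the intended top-level usage (the config-file name is illustrative,
and the model may need additional keyword arguments, e.g. the data, as in
``bin/gwin``):

    from pycbc.workflow import ConfigParser
    import gwin

    cp = ConfigParser(['inference.ini'])
    model = gwin.models.read_from_config(cp)
    # [sampler] name = ... selects the class; everything else in that section
    # is handled by the chosen class's from_config
    sampler = gwin.sampler.load_from_config(cp, model, nprocesses=1,
                                            use_mpi=False)
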
A distribution will only be loaded if the config file has a [initial-*] diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 4dc5c6d..6462ad8 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -23,9 +23,14 @@ # """Provides constructor classes and convenience functions for MCMC samplers.""" +from __future__ import absolute_import + from abc import (ABCMeta, abstractmethod, abstractproperty) import logging import numpy +from pycbc.filter import autocorrelation + +from ..io import validate_checkpoint_files # # ============================================================================= @@ -65,7 +70,10 @@ def raw_samples_to_dict(sampler, raw_samples): samples = sampler.model.prior_distribution.apply_boundary_conditions( **samples) # apply transforms to go to model's variable params space - return sampler.model.sampling_transforms.apply(samples, inverse=True) + if sampler.model.sampling_transforms is not None: + samples = sampler.model.sampling_transforms.apply( + samples, inverse=True) + return samples def raw_stats_to_dict(sampler, raw_stats): @@ -132,6 +140,7 @@ class BaseMCMC(object): _p0 = None _nwalkers = None _burn_in = None + _checkpoint_interval = None @abstractproperty def base_shape(self): @@ -155,13 +164,23 @@ def nwalkers(self): def niterations(self): """Get the current number of iterations.""" itercounter = self._itercounter - if _itercounter is None: + if itercounter is None: itercounter = 0 lastclear = self._lastclear if lastclear is None: lastclear = 0 return itercounter + lastclear + @property + def checkpoint_interval(self): + """The number of iterations to do between checkpoints.""" + return self._checkpoint_interval + + @abstractmethod + def clear_samples(self): + """A method to clear samples from memory.""" + pass + @property def pos(self): pos = self._pos @@ -209,19 +228,20 @@ def set_p0(self, samples_file=None, prior=None): samples = fp.read_samples(self.variable_params, iteration=-1) # make sure we have the same shape - assert(samples.shape[:-1] == self.samples_shape, + assert samples.shape == self.base_shape, ( "samples in file {} have shape {}, but I have shape {}". - format(samples_file, samples.shape, self.samples_shape)) + format(samples_file, samples.shape, self.base_shape)) # transform to sampling parameter space - samples = self.model.sampling_transforms.apply(samples) + if self.model.sampling_transforms is not None: + samples = self.model.sampling_transforms.apply(samples) # draw random samples if samples are not provided else: - nsamples = numpy.prod(self.samples_shape) + nsamples = numpy.prod(self.base_shape) samples = self.model.prior_rvs(size=nsamples, prior=prior).reshape( - self.samples_shape) - # store as ND array with shape [samples_shape] x nparams + self.base_shape) + # store as ND array with shape [base_shape] x nparams ndim = len(self.variable_params) - p0 = numpy.ones(list(self.samples_shape)+[ndim]) + p0 = numpy.ones(list(self.base_shape)+[ndim]) for i, param in enumerate(self.sampling_params): p0[..., i] = samples[param] self._p0 = p0 @@ -246,12 +266,6 @@ def set_state_from_file(self, filename): """ pass - @abstractmethod - def write_state(self, filename): - """Saves the state of the sampler to the given file. 
- """ - pass - def run(self): """Runs the sampler.""" @@ -266,7 +280,7 @@ def run(self): # contains (either due to sampler burn-in, or a previous checkpoint) try: with self.io(self.checkpoint_file, "r") as fp: - start = fp.niterations + startiter = fp.niterations except KeyError: startiter = 0 if self.require_indep_samples: @@ -312,7 +326,7 @@ def run(self): nsamples += iterinterval * self.nwalkers self._itercounter = startiter = enditer - @propetry + @property def burn_in(self): """The class for doing burn-in tests (if specified).""" return self._burn_in @@ -321,6 +335,7 @@ def set_burn_in(self, burn_in): """Sets the object to use for doing burn-in tests.""" self._burn_in = burn_in + @property def n_indep_samples(self): """The number of independent samples post burn-in that the sampler has acquired so far.""" @@ -360,25 +375,26 @@ def checkpoint(self): # it, in which case we don't need to do it again. if self.acls is None: logging.info("Computing acls") - self.acls = self.compute_acls(self.checkpoint_file) + self.acls = self.compute_acl(self.checkpoint_file) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: if self.burn_in is not None: fp.write_burn_in(self.burn_in) if self.acls is not None: - fp.write_acls(acls) + fp.write_acls(self.acls) # write the current number of iterations fp.attrs['niterations'] = self.niterations fp.attrs['n_indep_samples'] = self.n_indep_samples # check validity + logging.info("Validating checkpoint and backup files") checkpoint_valid = validate_checkpoint_files( self.checkpoint_file, self.backup_file) if not checkpoint_valid: raise IOError("error writing to checkpoint file") # clear the in-memory chain to save memory - logging.info("Clearing chain") - self.clear_chain() + logging.info("Clearing samples from memory") + self.clear_samples() @abstractmethod def compute_acf(cls, filename, **kwargs): @@ -398,8 +414,8 @@ class MCMCAutocorrSupport(object): """ @classmethod - def compute_acfs(cls, filename, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None): + def compute_acf(cls, filename, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None): """Computes the autocorrleation function of the model params in the given file. @@ -435,7 +451,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, ``nwalkers x niterations``. 
""" acfs = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: if parameters is None: parameters = fp.variable_params if isinstance(parameters, str) or isinstance(parameters, unicode): @@ -446,15 +462,15 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, if walkers is None: walkers = numpy.arange(fp.nwalkers) arrays = [ - cls.compute_acfs(filename, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] + cls.compute_acf(filename, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] for ii in walkers] acfs[param] = numpy.vstack(arrays) else: samples = fp.read_raw_samples( - fp, param, thin_start=start_index, thin_interval=1, + param, thin_start=start_index, thin_interval=1, thin_end=end_index, walkers=walkers, flatten=False)[param] samples = samples.mean(axis=0) @@ -463,7 +479,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, return acfs @classmethod - def compute_acls(cls, filename, start_index=None, end_index=None): + def compute_acl(cls, filename, start_index=None, end_index=None): """Computes the autocorrleation length for all model params in the given file. @@ -489,10 +505,10 @@ def compute_acls(cls, filename, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: for param in fp.variable_params: samples = fp.read_raw_samples( - fp, param, thin_start=start_index, thin_interval=1, + param, thin_start=start_index, thin_interval=1, thin_end=end_index, flatten=False)[param] samples = samples.mean(axis=0) acl = autocorrelation.calculate_acl(samples) diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index d935cb3..1888c67 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -36,7 +36,9 @@ from .base import BaseSampler from .base_mcmc import (BaseMCMC, MCMCAutocorrSupport, raw_samples_to_dict, raw_stats_to_dict) -from gwin.burn_in import MCMCBurnInTests +from ..burn_in import MCMCBurnInTests +from ..io import EmceeFile +from .. import models # @@ -66,8 +68,8 @@ class EmceeEnsembleSampler(MCMCAutocorrSupport, BaseMCMC, BaseSampler): _io = EmceeFile burn_in_class = MCMCBurnInTests - def __init__(self, model, nwalkers, logpost_function=None, - nprocesses=1, use_mpi=False): + def __init__(self, model, nwalkers, checkpoint_interval=None, + logpost_function=None, nprocesses=1, use_mpi=False): self.model = model # create a wrapper for calling the model @@ -93,6 +95,7 @@ def __init__(self, model, nwalkers, logpost_function=None, # to have the same state as the numpy generator rstate = numpy.random.get_state() self._sampler.random_state = rstate + self._checkpoint_interval = checkpoint_interval @property def io(self): @@ -124,7 +127,11 @@ def model_stats(self): The returned array has shape ``nwalkers x niterations``. """ - return raw_samples_to_dict(self._sampler.blobs, raw_stats) + raw_stats = numpy.array(self._sampler.blobs) + # raw_stats has shape niterations x nwalkers x nstats; transpose + # so that it has shape nwalkers x niterations x nstats + raw_stats = raw_stats.transpose((1, 0, 2)) + return raw_samples_to_dict(self, raw_stats) def clear_samples(self): """Clears the samples and stats from memory. @@ -154,20 +161,10 @@ def run_mcmc(self, niterations, **kwargs): Number of iterations to run the sampler for. \**kwargs : All other keyword arguments are passed to the emcee sampler. 
- - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). - lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). - rstate : - The current state of the random number generator. """ pos = self._pos if pos is None: - pos = self.p0 + pos = self._p0 res = self._sampler.run_mcmc(pos, niterations, **kwargs) p, _, _ = res[0], res[1], res[2] # update the positions @@ -207,16 +204,22 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): "name in section [sampler] must match mine") # get the number of walkers to use nwalkers = int(cp.get(section, "nwalkers")) + # get the checkpoint interval, if it's specified + if cp.has_option(section, "checkpoint-interval"): + checkpoint_interval = int(cp.get(section, "checkpoint-interval")) + else: + checkpoint_interval = None if cp.has_option(section, "logpost-function"): lnpost = cp.get(section, "logpost-function") else: lnpost = None - obj = cls(model, nwalkers, logpost_function=lnpost, - nprocesses=nprocesses, use_mpi=use_mpi) + obj = cls(model, nwalkers, checkpoint_interval=checkpoint_interval, + logpost_function=lnpost, nprocesses=nprocesses, + use_mpi=use_mpi) # add burn-in if it's specified try: bit = obj.burn_in_class.from_config(cp, obj) - except ConfigParser.NoSectionError: + except ConfigParser.Error: bit = None obj.set_burn_in(bit) return obj From e87158234c514c35ad018ec5d0cad53ae86e6bea Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 3 Aug 2018 10:58:01 +0200 Subject: [PATCH 38/47] fix bugs, move niterations/nsamples into config file --- bin/gwin | 15 ------ gwin/io/base_hdf.py | 28 ++++++++-- gwin/io/base_mcmc.py | 22 +++++++- gwin/sampler/base.py | 38 ++------------ gwin/sampler/base_mcmc.py | 104 ++++++++++++++++++++++++-------------- gwin/sampler/emcee.py | 17 +++++-- 6 files changed, 126 insertions(+), 98 deletions(-) diff --git a/bin/gwin b/bin/gwin index 26ce45b..cacded7 100644 --- a/bin/gwin +++ b/bin/gwin @@ -57,18 +57,6 @@ parser.add_argument("--nprocesses", type=int, default=1, "a single core will be used.") parser.add_argument("--use-mpi", action='store_true', default=False, help="Use MPI to parallelize the sampler") -# run duration options -parser.add_argument("--nsamples", type=int, required=True, - help="The number of samples the sampler should get. " - "The sampler will run until it has acquired at least " - "this many samples. Depending on checkpoint settings " - "it may go over.") -parser.add_argument("--require-indep-samples", action="store_true", - default=False, - help="Require that the number of samples set by nsamples " - "be independent. If this is not set, MCMC samplers " - "will just run until they have the desried number of " - "raw samples (with no thinning).") parser.add_argument("--samples-file", default=None, help="Use an iteration from an InferenceFile as the " "initial proposal distribution. 
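With those options gone, the run length is set in the configuration file and
read by the sampler's ``from_config``; a sketch of the corresponding
``[sampler]`` section (values are illustrative, and ``niterations`` may be
given instead of ``effective-nsamples``):

    [sampler]
    name = emcee
    nwalkers = 5000
    effective-nsamples = 4000
    checkpoint-interval = 2000
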
The same " @@ -201,9 +189,6 @@ with ctx: sampler.set_initial_conditions(initial_distribution=init_prior, samples_file=samples_file) - # Set the target number of samples for the sampler - sampler.set_target(opts.nsamples, opts.require_indep_samples) - # Run the sampler sampler.run() diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 8355ecb..f52bf02 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -107,6 +107,14 @@ def write_samples(self, samples, **kwargs): """ pass + @abstractmethod + def write_sampler_metadata(self, sampler): + """This should write the given sampler's metadata to the file. + + This should also include the model's metadata. + """ + pass + def parse_parameters(self, parameters, array_class=None): """Parses a parameters arg to figure out what fields need to be loaded. @@ -212,14 +220,18 @@ def static_params(self): return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} @property - def n_indep_samples(self): - """Returns the number of independent samples stored in the file. + def effective_nsamples(self): + """Returns the effective number of samples stored in the file. """ try: - return self.attrs['n_indep_samples'] + return self.attrs['effective_nsamples'] except KeyError: return 0 + def write_effective_nsamples(self, effective_nsamples): + """Writes the effective number of samples stored in the file.""" + self.attrs['effective_nsamples'] = effective_nsamples + @property def thin_start(self): """The default start index to use when reading samples. @@ -444,6 +456,16 @@ def write_command_line(self): previous = [] self.attrs["cmd"] = cmd + previous + @abstractmethod + def write_resume_point(self): + """Should write the point that a sampler starts up. + + How the resume point is indexed is up to the sampler. For example, + MCMC samplers use the number of iterations that are stored in the + checkpoint file. + """ + pass + def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): """Formats a slice using the given arguments that can be used to retrieve a thinned array from an InferenceFile. diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index d2087d4..d9a401a 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -156,6 +156,26 @@ def write_resume_point(self): resume_pts.append(niterations) self.attrs["resume_points"] = resume_pts + def write_niterations(self, niterations): + """Writes the given number of iterations to the sampler group.""" + self[self.sampler_group].attrs['niterations'] = niterations + + @property + def niterations(self): + """Returns the number of iterations the sampler was run for.""" + return self[self.sampler_group].attrs['niterations'] + + def write_sampler_metadata(self, sampler): + """Writes the sampler's metadata.""" + self.attrs['sampler'] = sampler.name + if self.sampler_group not in self.keys(): + # create the sampler group + self.create_group(self.sampler_group) + self[self.sampler_group].attrs['nwalkers'] = sampler.nwalkers + # write the model's metadata + sampler.model.write_metadata(self) + + def write_acls(self, acls): """Writes the given autocorrelation lengths. 
@@ -186,7 +206,7 @@ def write_acls(self, acls): self[group.format(param)] = acls[param] # write the maximum over all params acl = numpy.array(acls.values()).max() - self.attrs['acl'] = acl + self[self.sampler_group].attrs['acl'] = acl # set the default thin interval to be the acl self.attrs['thin_interval'] = acl diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index af041fa..86971f0 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -141,19 +141,6 @@ def finalize(self): """Do any finalization to the samples file before exiting.""" pass - def write_metadata(self, fp): - """Writes metadata about the sampler to the given filehandler.""" - fp.attrs['sampler'] = self.name - # write the model's metadata - self.model.write_metadata(fp) - self._write_more_metadata(fp) - - def _write_more_metadata(self, fp): - """Optional method that can be implemented if a sampler wants to write - more metadata than just its name and the model's metadata. - """ - pass - def setup_output(self, output_file, force=False, injection_file=None): """Sets up the sampler's checkpoint and output files. @@ -193,35 +180,16 @@ def setup_output(self, output_file, force=False, injection_file=None): self.new_checkpoint = True # copy to backup shutil.copy(checkpoint_file, backup_file) - # write the command line + # write the command line, startup for fn in [checkpoint_file, backup_file]: with self.io(fn, "a") as fp: fp.write_command_line() + fp.write_resume_point() # store self.checkpoint_file = checkpoint_file self.backup_file = backup_file self.checkpoint_valid = checkpoint_valid - def set_target(self, nsamples, require_independent=False): - """Sets the number of samples the sampler should try to acquire. - - If the ``must_be_independent`` flag is set, then the number of samples - must be independent. This means, for example, that MCMC chains are - thinned by their ACL before counting samples. Otherwise, the sampler - will just run until it has the requested number of samples, regardless - of thinning. - - Parameters - ---------- - nsamples : int - The number of samples to acquire. - must_be_independent : bool, optional - Add the requirement that the target number of samples be - independent. Default is False. - """ - self.target_nsamples = nsamples - self.require_indep_samples = require_independent - # # ============================================================================= @@ -261,7 +229,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, logging.info("Creating file {}".format(filename)) with sampler.io(filename, "w") as fp: # save the sampler's metadata - sampler.write_metadata(fp) + fp.write_sampler_metadata(sampler) # save injection parameters if injection_file is not None: logging.info("Writing injection file to output") diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 6462ad8..efaa16f 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -105,7 +105,7 @@ def raw_stats_to_dict(sampler, raw_stats): # therefore immediately convert this to a ND array. 
raw_stats = numpy.array(raw_stats) return {stat: raw_stats[..., ii] - for (ii, stat) in enumerate(self.model.default_stats)} + for (ii, stat) in enumerate(sampler.model.default_stats)} # # ============================================================================= @@ -134,13 +134,15 @@ class BaseMCMC(object): """ __metaclass__ = ABCMeta - _lastclear = None - _itercounter = None + _lastclear = None # the iteration when samples were cleared from memory + _itercounter = None # the number of iterations since the last clear _pos = None _p0 = None _nwalkers = None _burn_in = None _checkpoint_interval = None + _target_niterations = None + _target_eff_nsamples = None @abstractproperty def base_shape(self): @@ -176,6 +178,32 @@ def checkpoint_interval(self): """The number of iterations to do between checkpoints.""" return self._checkpoint_interval + @property + def target_niterations(self): + """The number of iterations the sampler should run for.""" + return self._target_niterations + + @property + def target_eff_nsamples(self): + """The target number of effective samples the sampler should get.""" + return self._target_eff_nsamples + + def set_target(self, niterations=None, eff_nsamples=None): + """Sets the target niterations/nsamples for the sampler. + + One or the other must be provided, not both. + """ + if niterations is None and eff_nsamples is None: + raise ValueError("Must provide a target niterations or " + "eff_nsamples") + if niterations is not None and eff_nsamples is not None: + raise ValueError("Must provide a target niterations or " + "eff_nsamples, not both") + self._target_niterations = int(niterations) \ + if niterations is not None else None + self._target_eff_nsamples = int(eff_nsamples) \ + if eff_nsamples is not None else None + @abstractmethod def clear_samples(self): """A method to clear samples from memory.""" @@ -268,63 +296,61 @@ def set_state_from_file(self, filename): def run(self): """Runs the sampler.""" - - if self.require_indep_samples and self.checkpoint_interval is None: + if self.target_eff_nsamples and self.checkpoint_interval is None: raise ValueError("A checkpoint interval must be set if " - "independent samples are required") + "targetting an effective number of samples") # get the starting number of samples: # "nsamples" keeps track of the number of samples we've obtained (if - # require_indep_samples is used, this is the number of independent + # target_eff_nsamples is not None, this is the effective number of # samples; otherwise, this is the total number of samples). 
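Equivalently, when driving a sampler instance directly, the target is chosen
with ``set_target`` (numbers are illustrative):

    # run for a fixed number of iterations ...
    sampler.set_target(niterations=1000)
    # ... or until enough effective (post burn-in, ACL-thinned) samples exist;
    # this mode requires a checkpoint_interval so the count can be re-checked
    sampler.set_target(eff_nsamples=4000)
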
- # "startiter" is the number of iterations that the file already + # _lastclear is the number of iterations that the file already # contains (either due to sampler burn-in, or a previous checkpoint) - try: + if self.new_checkpoint: + self._lastclear = 0 + else: with self.io(self.checkpoint_file, "r") as fp: - startiter = fp.niterations - except KeyError: - startiter = 0 - if self.require_indep_samples: + self._lastclear = fp.niterations + if self.target_eff_nsamples is not None: + target_nsamples = self.target_eff_nsamples with self.io(self.checkpoint_file, "r") as fp: - nsamples = fp.n_indep_samples - else: + nsamples = fp.effective_nsamples + elif self.target_niterations is not None: # the number of samples is the number of iterations times the # number of walkers - nsamples = startiter * self.nwalkers - # to ensure iterations are counted properly, the sampler's lastclear - # should be the same as start - self._lastclear = startiter - # keep track of the number of iterations we've done - self._itercounter = startiter + target_nsamples = self.nwalkers * self.target_niterations + nsamples = self._lastclear * self.nwalkers + else: + raise ValueError("must set either target_eff_nsamples or " + "target_niterations; see set_target") + self._itercounter = 0 # figure out the interval to use iterinterval = self.checkpoint_interval if iterinterval is None: - iterinterval = int(numpy.ceil( - float(self.target_nsamples) / self.nwalkers)) + iterinterval = self.target_niterations # run sampler until we have the desired number of samples - while nsamples < self.target_nsamples: - enditer = startiter + iterinterval + while nsamples < target_nsamples: # adjust the interval if we would go past the number of iterations - endnsamp = enditer * self.nwalkers - if endnsamp > self.target_nsamples \ - and not self.require_indep_samples: - iterinterval = int(numpy.ceil( - (endnsamp - self.target_nsamples) / self.nwalkers)) + if self.target_niterations is not None and ( + self.niterations + iterinterval > self.target_niterations): + iterinterval = self.target_niterations - self.niterations # run sampler and set initial values to None so that sampler # picks up from where it left off next call logging.info("Running sampler for {} to {} iterations".format( - startiter, enditer)) + self.niterations, self.niterations + iterinterval)) # run the underlying sampler for the desired interval self.run_mcmc(iterinterval) + # update the itercounter + #startiter = startiter + iterinterval + self._itercounter = self._itercounter + iterinterval # dump the current results self.checkpoint() # update nsamples for next loop - if self.require_indep_samples: - nsamples = self.n_indep_samples - logging.info("Have {} independent samples post burn in".format( + if self.target_eff_nsamples is not None: + nsamples = self.effective_nsamples + logging.info("Have {} effective samples post burn in".format( nsamples)) else: nsamples += iterinterval * self.nwalkers - self._itercounter = startiter = enditer @property def burn_in(self): @@ -336,8 +362,8 @@ def set_burn_in(self, burn_in): self._burn_in = burn_in @property - def n_indep_samples(self): - """The number of independent samples post burn-in that the sampler has + def effective_nsamples(self): + """The effective number of samples post burn-in that the sampler has acquired so far.""" if self.acls is None: acl = numpy.inf @@ -384,8 +410,8 @@ def checkpoint(self): if self.acls is not None: fp.write_acls(self.acls) # write the current number of iterations - fp.attrs['niterations'] = 
self.niterations - fp.attrs['n_indep_samples'] = self.n_indep_samples + fp.write_niterations(self.niterations) + fp.write_effective_nsamples(self.effective_nsamples) # check validity logging.info("Validating checkpoint and backup files") checkpoint_valid = validate_checkpoint_files( diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 1888c67..443f89d 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -101,10 +101,6 @@ def __init__(self, model, nwalkers, checkpoint_interval=None, def io(self): return self._io - def _write_more_metadata(self, fp): - """Adds nwalkers to the metadata.""" - fp.attrs['nwalkers'] = self.nwalkers - @property def base_shape(self): return (self.nwalkers,) @@ -131,13 +127,14 @@ def model_stats(self): # raw_stats has shape niterations x nwalkers x nstats; transpose # so that it has shape nwalkers x niterations x nstats raw_stats = raw_stats.transpose((1, 0, 2)) - return raw_samples_to_dict(self, raw_stats) + return raw_stats_to_dict(self, raw_stats) def clear_samples(self): """Clears the samples and stats from memory. """ # store the iteration that the clear is occuring on self._lastclear = self.niterations + self._itercounter = 0 # now clear the chain self._sampler.reset() self._sampler.clear_blobs() @@ -216,6 +213,16 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): obj = cls(model, nwalkers, checkpoint_interval=checkpoint_interval, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) + # get target + if cp.has_option(section, "niterations"): + niterations = int(cp.get(section, "niterations")) + else: + niterations = None + if cp.has_option(section, "effective-nsamples"): + nsamples = int(cp.get(section, "effective-nsamples")) + else: + nsamples = None + obj.set_target(niterations=niterations, eff_nsamples=nsamples) # add burn-in if it's specified try: bit = obj.burn_in_class.from_config(cp, obj) From 9046567b5263cbc7034d88de680975f932e0a559 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 3 Aug 2018 18:30:26 +0200 Subject: [PATCH 39/47] add halfchain, posterior_step, min_iterations back to burn_in --- gwin/burn_in.py | 143 ++++++++++++++++++++++++++++++------------------ 1 file changed, 89 insertions(+), 54 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index afd91a2..28aa8df 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -79,45 +79,6 @@ def ks_test(samples1, samples2, threshold=0.9): return is_the_same -def n_acl(chain, nacls=5): - """Burn in based on ACL. - - This applies the following test to determine burn in: - - 1. The first half of the chain is ignored. - - 2. An ACL is calculated from the second half. - - 3. If ``nacls`` times the ACL is < the number of iterations / 2, - the chain is considered to be burned in at the half-way point. - - Parameters - ---------- - chain : array - The chain of samples to apply the test to. Must be 1D. - nacls : int, optional - Number of ACLs to use for burn in. Default is 5. - - Returns - ------- - burn_in_idx : int - The burn in index. If the chain is not burned in, will be equal to the - length of the chain. - is_burned_in : bool - Whether or not the chain is burned in. - acl : int - The ACL that was estimated. - """ - kstart = int(len(chain)/2.) 
- acl = autocorrelation.calculate_acl(chain[kstart:]) - is_burned_in = nacls * acl < kstart - if is_burned_in: - burn_in_idx = kstart - else: - burn_in_idx = NOT_BURNED_IN_ITER - return burn_in_idx, is_burned_in, acl - - def max_posterior(lnps_per_walker, dim): """Burn in based on samples being within dim/2 of maximum posterior. @@ -126,7 +87,7 @@ def max_posterior(lnps_per_walker, dim): lnps_per_walker : 2D array Array of values that are proportional to the log posterior values. Must have shape ``nwalkers x niterations``. - dim : float + dim : int The dimension of the parameter space. Returns @@ -166,7 +127,7 @@ def posterior_step(logposts, dim): ---------- logposts : array 1D array of values that are proportional to the log posterior values. - dim : float + dim : int The dimension of the parameter space. Returns @@ -199,6 +160,10 @@ def posterior_step(logposts, dim): class MCMCBurnInTests(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" + available_tests = ('halfchain', 'min_iterations', 'max_posterior', + 'posterior_step', 'nacl', 'ks_test', + ) + def __init__(self, sampler, burn_in_test, **kwargs): self.sampler = sampler # determine the burn-in tests that are going to be done @@ -207,21 +172,62 @@ def __init__(self, sampler, burn_in_test, **kwargs): self.burn_in_data = {t: {} for t in self.do_tests} self.is_burned_in = False self.burn_in_iteration = None - if 'nacl' in burn_in_tests: - # get the number of acls to use - self._nacls = int(kwargs.pop('nacls', 5)) - if 'ks_test' in burn_in_tests: - self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) - - def max_posterior(self, filename): - """Applies max posterior test to self.""" - with sampler.io(filename, 'r') as fp: + # Arguments specific to each test... + # for nacl: + self._nacls = int(kwargs.pop('nacls', 5)) + # for kstest: + self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) + # for max_posterior and posterior_step + self._ndim = int(kwargs.pop('ndim', len(sampler.variable_args))) + # for min iterations + self._min_iterations = int(kwargs.pop('min_iterations', 0)) + + def _getlogposts(self, filename): + """Convenience function for retrieving log posteriors. + + Parameters + ---------- + filename : str + The file to read. + + Returns + ------- + array + The log posterior values. They are not flattened, so have dimension + nwalkers x niterations. + """ + with self.sampler.io(filename, 'r') as fp: samples = fp.read_raw_samples( ['loglikelihood', 'logprior'], thin_start=0, thin_interval=1, flatten=False) logposts = samples['loglikelihood'] + samples['logprior'] + return logposts + + def halfchain(self, filename): + """Just uses half the chain as the burn-in iteration. + """ + with self.sampler.io(filename, 'r') as fp: + niters = fp.niterations + data = self.burn_in_data['halfchain'] + # this test cannot determine when something will burn in + # only when it was not burned in in the past + data['is_burned_in'] = True + data['burn_in_iteration'] = niters/2 + + def min_iterations(self, filename): + """Just checks that the sampler has been run for the minimum number + of iterations. 
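A sketch of attaching a combined burn-in test to a sampler without a config
file (the test string and keyword values are illustrative):

    from gwin.burn_in import MCMCBurnInTests

    # the test string is evaluated with each test name replaced by its result,
    # so '&' requires both tests to pass and '|' accepts either one
    burn_in = MCMCBurnInTests(sampler, 'nacl & max_posterior', nacls=5)
    sampler.set_burn_in(burn_in)
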
+ """ + with self.sampler.io(filename, 'r') as fp: + niters = fp.niterations + data = self.burn_in_data['min_iterations'] + data['is_burned_in'] = niters >= self._min_iterations + data['burn_in_iteration'] = self._min_iterations + def max_posterior(self, filename): + """Applies max posterior test to self.""" + logposts = self._getlogposts(filename) burn_in_idx, is_burned_in = burn_in.max_posterior( - logposts, len(self.variable_params)) + logposts, self._ndim) data = self.burn_in_data['max_posterior'] # required things to store data['is_burned_in'] = is_burned_in.all() @@ -230,9 +236,32 @@ def max_posterior(self, filename): data['iteration_per_walker'] = burn_in_idx data['status_per_walker'] = is_burned_in + def posterior_step(self, filename): + """Applies the posterior-step test.""" + logposts = self._getlogposts(filename) + burn_in_idx = numpy.array([posterior_step(logps, self._ndim) + for logps in logposts]) + data = self.burn_in_data['posterior_step'] + # this test cannot determine when something will burn in + # only when it was not burned in in the past + data['is_burned_in'] = True + data['burn_in_iteration'] = burn_in_idx.max() + # additional info + data['iteration_per_walker'] = burn_in_idx + def nacl(self, filename): - """Applies the nacl burn-in test""" - with sampler.io(filename, 'r') as fp: + """Burn in based on ACL. + + This applies the following test to determine burn in: + + 1. The first half of the chain is ignored. + + 2. An ACL is calculated from the second half. + + 3. If ``nacls`` times the ACL is < the number of iterations / 2, + the chain is considered to be burned in at the half-way point. + """ + with self.sampler.io(filename, 'r') as fp: niters = fp.niterations kstart = int(niters / 2.) acls = sampler.compute_acls(filename, start_index=kstart) @@ -252,7 +281,7 @@ def nacl(self, filename): def ks_test(self, filename): """Applies ks burn-in test.""" - with sampler.io(filename, 'r') as fp: + with self.sampler.io(filename, 'r') as fp: niters = fp.niterations # get the samples from the mid point samples1 = fp.read_raw_samples( @@ -319,4 +348,10 @@ def from_config(cls, cp, sampler): if cp.has_option_tag(section, 'ks-threshold', tag): kwargs['ks_threshold'] = float( cp.get_opt_tag(section, 'ks-threshold', tag)) + if cp.has_option_tag(section, 'ndim', tag): + kwargs['ndim'] = int( + cp.get_opt_tag(section, 'ndim', tag)) + if cp.has_option_tag(section, 'min-iterations', tag): + kwargs['min_iterations'] = int( + cp.get_opt_tag(section, 'min-iterations', tag)) return cls(sampler, burn_in_test, **kwargs) From 7254c84400ebb86f6f44637f0d08aac5adcf0da8 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:24:58 +0200 Subject: [PATCH 40/47] fix bugs to get acl working post burn in --- gwin/burn_in.py | 47 +++++++++++++++++++++++++-------------- gwin/io/base_mcmc.py | 9 +++++--- gwin/sampler/base_mcmc.py | 47 ++++++++++++++++++++++++++------------- 3 files changed, 68 insertions(+), 35 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 28aa8df..ac7ad43 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -112,8 +112,8 @@ def max_posterior(lnps_per_walker, dim): for ii in range(nwalkers): chain = lnps_per_walker[ii, :] passedidx = numpy.where(chain >= criteria)[0] - is_burned_in[ii] = is_burned_in = passedidx.size > 0 - if is_burned_in: + is_burned_in[ii] = passedidx.size > 0 + if is_burned_in[ii]: burn_in_idx[ii] = passedidx[0] else: burn_in_idx[ii] = NOT_BURNED_IN_ITER @@ -171,17 +171,29 @@ def __init__(self, sampler, burn_in_test, **kwargs): 
self.burn_in_test = burn_in_test self.burn_in_data = {t: {} for t in self.do_tests} self.is_burned_in = False - self.burn_in_iteration = None + self.burn_in_iteration = NOT_BURNED_IN_ITER # Arguments specific to each test... # for nacl: self._nacls = int(kwargs.pop('nacls', 5)) # for kstest: self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) # for max_posterior and posterior_step - self._ndim = int(kwargs.pop('ndim', len(sampler.variable_args))) + self._ndim = int(kwargs.pop('ndim', len(sampler.variable_params))) # for min iterations self._min_iterations = int(kwargs.pop('min_iterations', 0)) + def _getniters(self, filename): + """Convenience function to get the number of iterations in the file. + + If `niterations` hasn't been written to the file yet, just returns 0. + """ + with self.sampler.io(filename, 'r') as fp: + try: + niters = fp.niterations + except KeyError: + niters = 0 + return niters + def _getlogposts(self, filename): """Convenience function for retrieving log posteriors. @@ -206,8 +218,7 @@ def _getlogposts(self, filename): def halfchain(self, filename): """Just uses half the chain as the burn-in iteration. """ - with self.sampler.io(filename, 'r') as fp: - niters = fp.niterations + niters = self._getniters(filename) data = self.burn_in_data['halfchain'] # this test cannot determine when something will burn in # only when it was not burned in in the past @@ -218,20 +229,22 @@ def min_iterations(self, filename): """Just checks that the sampler has been run for the minimum number of iterations. """ - with self.sampler.io(filename, 'r') as fp: - niters = fp.niterations + niters = self._getniters(filename) data = self.burn_in_data['min_iterations'] data['is_burned_in'] = niters >= self._min_iterations data['burn_in_iteration'] = self._min_iterations + def max_posterior(self, filename): """Applies max posterior test to self.""" logposts = self._getlogposts(filename) - burn_in_idx, is_burned_in = burn_in.max_posterior( - logposts, self._ndim) + burn_in_idx, is_burned_in = max_posterior(logposts, self._ndim) data = self.burn_in_data['max_posterior'] # required things to store data['is_burned_in'] = is_burned_in.all() - data['burn_in_iteration'] = burn_in_idx.max() + if data['is_burned_in']: + data['burn_in_iteration'] = burn_in_idx.max() + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER # additional info data['iteration_per_walker'] = burn_in_idx data['status_per_walker'] = is_burned_in @@ -261,8 +274,7 @@ def nacl(self, filename): 3. If ``nacls`` times the ACL is < the number of iterations / 2, the chain is considered to be burned in at the half-way point. """ - with self.sampler.io(filename, 'r') as fp: - niters = fp.niterations + niters = self._getniters(filename) kstart = int(niters / 2.) acls = sampler.compute_acls(filename, start_index=kstart) is_burned_in = {param: (self._nacls * acl) < kstart @@ -305,7 +317,7 @@ def ks_test(self, filename): def evaluate(self, filename): """Runs all of the burn-in tests.""" - for tst in self.tests_to_do: + for tst in self.do_tests: getattr(self, tst)(filename) # The iteration to use for burn-in depends on the logic in the burn-in # test string. For example, if the test was 'max_posterior | nacl' and @@ -319,12 +331,13 @@ def evaluate(self, filename): # by that point. Then evaluate the burn-in string at that point to see # if it passes, and if so, what the iteration is. The first point that # the test passes is used as the burn-in iteration. 
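The selection logic described in the comment above, as a self-contained sketch
(the test names are real, the iterations are illustrative):

    data = {'max_posterior': {'is_burned_in': True, 'burn_in_iteration': 5000},
            'nacl': {'is_burned_in': True, 'burn_in_iteration': 6000}}
    burn_in_test = 'max_posterior & nacl'
    for ii in sorted(d['burn_in_iteration'] for d in data.values()):
        results = {t: d['is_burned_in'] and d['burn_in_iteration'] <= ii
                   for (t, d) in data.items()}
        if eval(burn_in_test, {"__builtins__": None}, results):
            burn_in_iteration = ii
            break
    # '&' stops at 6000 here; 'max_posterior | nacl' would stop at 5000
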
- burn_in_iters = numpy.unique([self.data[t]['burn_in_iteration'] + data = self.burn_in_data + burn_in_iters = numpy.unique([data[t]['burn_in_iteration'] for t in self.do_tests]) burn_in_iters.sort() for ii in burn_in_iters: - test_results = {t: (self.data[t]['is_burned_in'] & - self.data[t]['burn_in_iteration'] <= ii) + test_results = {t: (data[t]['is_burned_in'] & + 0 <= data[t]['burn_in_iteration'] <= ii) for t in self.do_tests} is_burned_in = eval(self.burn_in_test, {"__builtins__": None}, test_results) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index d9a401a..b306260 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -24,10 +24,12 @@ """Provides I/O that is specific to MCMC samplers. """ +from __future__ import absolute_import + from abc import (ABCMeta, abstractmethod) import numpy - +from .base_hdf import write_kwargs_to_hdf_attrs class MCMCIO(object): """Abstract base class that provides some IO functions for ensemble MCMCs. @@ -207,8 +209,9 @@ def write_acls(self, acls): # write the maximum over all params acl = numpy.array(acls.values()).max() self[self.sampler_group].attrs['acl'] = acl - # set the default thin interval to be the acl - self.attrs['thin_interval'] = acl + # set the default thin interval to be the acl (if it is finite) + if numpy.isfinite(acl): + self.attrs['thin_interval'] = acl def read_acls(self): """Reads the acls of all the parameters. diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index efaa16f..71f479a 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -365,15 +365,20 @@ def set_burn_in(self, burn_in): def effective_nsamples(self): """The effective number of samples post burn-in that the sampler has acquired so far.""" - if self.acls is None: - acl = numpy.inf - else: + try: acl = numpy.array(self.acls.values()).max() + except (AttributeError, TypeError): + acl = numpy.inf if self.burn_in is None: niters = self.niterations + elif not self.burn_in.is_burned_in: + nperwalker = 0 else: - niters = self.niterations - self.burn_in.burn_in_iteration - return self.nwalkers * int(niters // acl) + nperwalker = int( + (self.niterations - self.burn_in.burn_in_iteration) // acl) + # after burn in, we always have atleast 1 sample per walker + nperwalker = max(nperwalker, 1) + return self.nwalkers * nperwalker @abstractmethod def run_mcmc(self, niterations): @@ -388,20 +393,27 @@ def write_results(self, filename): def checkpoint(self): """Dumps current samples to the checkpoint file.""" # write new samples - logging.info("Writing samples to file") - self.write_results(self.checkpoint_file) - logging.info("Writing to backup file") - self.write_results(self.backup_file) + logging.info("Writing samples to files") + for fn in [self.checkpoint_file, self.backup_file]: + self.write_results(fn) + with self.io(fn, "a") as fp: + # write the current number of iterations + fp.write_niterations(self.niterations) # check for burn in, compute the acls self.acls = None if self.burn_in is not None: logging.info("Updating burn in") self.burn_in.evaluate(self.checkpoint_file) + burn_in_iter = self.burn_in.burn_in_iteration + logging.info("Is burned in: {}".format(self.burn_in.is_burned_in)) + else: + burn_in_iter = 0 # Compute acls; the burn_in test may have calculated an acl and saved # it, in which case we don't need to do it again. 
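As a quick numerical illustration of the effective_nsamples bookkeeping revised above (all numbers are invented), the property thins the post-burn-in chain by the worst-case ACL over parameters and guarantees at least one sample per walker once burn in has been reached:

    import numpy

    niterations, burn_in_iteration = 5000, 2000
    acls = {'mass1': 8.0, 'mass2': 10.0}   # hypothetical per-parameter ACLs
    nwalkers, is_burned_in = 200, True

    acl = numpy.array(list(acls.values())).max()   # thin by the largest ACL
    if is_burned_in:
        # post burn in: thinned samples per walker, but never fewer than 1
        nperwalker = max(int((niterations - burn_in_iteration) // acl), 1)
    else:
        nperwalker = 0
    print(nwalkers * nperwalker)   # 200 walkers * 300 samples each = 60000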
if self.acls is None: logging.info("Computing acls") - self.acls = self.compute_acl(self.checkpoint_file) + self.acls = self.compute_acl(self.checkpoint_file, + start_index=burn_in_iter) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: @@ -409,8 +421,7 @@ def checkpoint(self): fp.write_burn_in(self.burn_in) if self.acls is not None: fp.write_acls(self.acls) - # write the current number of iterations - fp.write_niterations(self.niterations) + # write effective number of samples fp.write_effective_nsamples(self.effective_nsamples) # check validity logging.info("Validating checkpoint and backup files") @@ -537,8 +548,14 @@ def compute_acl(cls, filename, start_index=None, end_index=None): param, thin_start=start_index, thin_interval=1, thin_end=end_index, flatten=False)[param] samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size + # if < 10 samples, just set to inf + # Note: this should be done inside of pycbc's autocorrelation + # function + if samples.size < 10: + acl = numpy.inf + else: + acl = autocorrelation.calculate_acl(samples) + if acl <= 0: + acl = numpy.inf acls[param] = acl return acls From 7f0952ec919a8c181752405bf4e0ebdfeaa6f6da Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:34:52 +0200 Subject: [PATCH 41/47] fix bugs in nacl burn in test --- gwin/burn_in.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index ac7ad43..40244c8 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -276,7 +276,7 @@ def nacl(self, filename): """ niters = self._getniters(filename) kstart = int(niters / 2.) - acls = sampler.compute_acls(filename, start_index=kstart) + acls = self.sampler.compute_acl(filename, start_index=kstart) is_burned_in = {param: (self._nacls * acl) < kstart for (param, acl) in acls.items()} data = self.burn_in_data['nacl'] @@ -289,7 +289,7 @@ def nacl(self, filename): # additional information data['status_per_parameter'] = is_burned_in # since we calculated it, save the acls to the sampler - sampler.acls = acls + self.sampler.acls = acls def ks_test(self, filename): """Applies ks burn-in test.""" From 67e188cdbebf1ebfe87813f60a0688307243128e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:36:14 +0200 Subject: [PATCH 42/47] write more information to the logging messages --- gwin/sampler/base_mcmc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 71f479a..dbc1ff8 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -406,6 +406,9 @@ def checkpoint(self): self.burn_in.evaluate(self.checkpoint_file) burn_in_iter = self.burn_in.burn_in_iteration logging.info("Is burned in: {}".format(self.burn_in.is_burned_in)) + if self.burn_in.is_burned_in: + logging.info("Burn-in iteration: {}".format( + self.burn_in.burn_in_iteration)) else: burn_in_iter = 0 # Compute acls; the burn_in test may have calculated an acl and saved @@ -414,6 +417,7 @@ def checkpoint(self): logging.info("Computing acls") self.acls = self.compute_acl(self.checkpoint_file, start_index=burn_in_iter) + logging.info("ACL: {}".format(numpy.array(self.acls.values()).max())) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: From a73008b0d35ae83b24049297766abc22ba439c9f Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:48:49 +0200 Subject: [PATCH 43/47] fix bugs in 
min_iterations burn-in test --- gwin/burn_in.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 40244c8..f29e76e 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -74,7 +74,7 @@ def ks_test(samples1, samples2, threshold=0.9): for param in samples1: s1 = samples1[param] s2 = samples2[param] - _, p_value = ks_2samp(samples_last_iter, samples_chain_midpt) + _, p_value = ks_2samp(s1, s2) is_the_same[param] = p_value > threshold return is_the_same @@ -231,8 +231,11 @@ def min_iterations(self, filename): """ niters = self._getniters(filename) data = self.burn_in_data['min_iterations'] - data['is_burned_in'] = niters >= self._min_iterations - data['burn_in_iteration'] = self._min_iterations + data['is_burned_in'] = self._min_iterations < niters + if data['is_burned_in']: + data['burn_in_iteration'] = self._min_iterations + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER def max_posterior(self, filename): """Applies max posterior test to self.""" @@ -304,7 +307,7 @@ def ks_test(self, filename): # do the test # is_the_same is a dictionary of params --> bool indicating whether or # not the 1D marginal is the same at the half way point - is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) + is_the_same = ks_test(samples1, samples2, threshold=self._ksthreshold) data = self.burn_in_data['ks_test'] # required things to store data['is_burned_in'] = all(is_the_same.values()) From f6e1d5b15620a38ae2078d65bcb3c7189ae75bb4 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 15:04:25 +0200 Subject: [PATCH 44/47] fix more bugs --- gwin/sampler/base_mcmc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index dbc1ff8..a87fa1e 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -370,14 +370,14 @@ def effective_nsamples(self): except (AttributeError, TypeError): acl = numpy.inf if self.burn_in is None: - niters = self.niterations - elif not self.burn_in.is_burned_in: - nperwalker = 0 - else: + nperwalker = max(int(self.niterations // acl), 1) + elif self.burn_in.is_burned_in: nperwalker = int( (self.niterations - self.burn_in.burn_in_iteration) // acl) # after burn in, we always have atleast 1 sample per walker nperwalker = max(nperwalker, 1) + else: + nperwalker = 0 return self.nwalkers * nperwalker @abstractmethod From a257aed9971e24e385a2041edd2d1eea047b5012 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 15:12:26 +0200 Subject: [PATCH 45/47] fix pep8 issues --- gwin/burn_in.py | 2 +- gwin/io/__init__.py | 1 + gwin/io/base_hdf.py | 2 +- gwin/io/base_mcmc.py | 2 +- gwin/sampler/__init__.py | 11 ++++++----- gwin/sampler/base.py | 3 ++- gwin/sampler/base_mcmc.py | 5 ++--- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index f29e76e..d87bf69 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -196,7 +196,7 @@ def _getniters(self, filename): def _getlogposts(self, filename): """Convenience function for retrieving log posteriors. - + Parameters ---------- filename : str diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index ea519eb..c284bf6 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -31,6 +31,7 @@ EmceeFile.name: EmceeFile, } + def loadfile(path, mode=None, filetype=None, **kwargs): """Loads the given file using the appropriate InferenceFile class. 
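For reference, the ks_2samp call corrected above compares the 1D marginal distribution of each parameter at the chain mid-point against the marginal at the end of the chain; the chain is considered burned in at the mid-point only if every parameter looks the same at both points. A standalone sketch with invented samples (scipy's two-sample KS test returns a statistic and a p-value):

    import numpy
    from scipy.stats import ks_2samp

    # stand-ins for samples read at the chain mid-point and at the last iteration
    samples_midpt = {'mass1': numpy.random.normal(30., 1., size=5000)}
    samples_end = {'mass1': numpy.random.normal(30., 1., size=5000)}
    threshold = 0.9   # same default as the ks_threshold kwarg

    is_the_same = {}
    for param in samples_midpt:
        _, p_value = ks_2samp(samples_midpt[param], samples_end[param])
        is_the_same[param] = p_value > threshold
    # burned in at the mid-point only if all parameters pass
    print(all(is_the_same.values()))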
diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index f52bf02..1b3961a 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -459,7 +459,7 @@ def write_command_line(self): @abstractmethod def write_resume_point(self): """Should write the point that a sampler starts up. - + How the resume point is indexed is up to the sampler. For example, MCMC samplers use the number of iterations that are stored in the checkpoint file. diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index b306260..7e1c3b4 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -31,6 +31,7 @@ import numpy from .base_hdf import write_kwargs_to_hdf_attrs + class MCMCIO(object): """Abstract base class that provides some IO functions for ensemble MCMCs. """ @@ -176,7 +177,6 @@ def write_sampler_metadata(self, sampler): self[self.sampler_group].attrs['nwalkers'] = sampler.nwalkers # write the model's metadata sampler.model.write_metadata(self) - def write_acls(self, acls): """Writes the given autocorrelation lengths. diff --git a/gwin/sampler/__init__.py b/gwin/sampler/__init__.py index 5b6e435..aa7cf3a 100644 --- a/gwin/sampler/__init__.py +++ b/gwin/sampler/__init__.py @@ -21,15 +21,16 @@ from .base import (initial_dist_from_config, create_new_output_file) # from .kombine import KombineSampler -from .emcee import (EmceeEnsembleSampler, ) # EmceePTSampler) +from .emcee import EmceeEnsembleSampler +# from .emcee_pt import EmceePTSampler # from .mcmc import MCMCSampler # list of available samplers samplers = {cls.name: cls for cls in ( - #KombineSampler, + # KombineSampler, EmceeEnsembleSampler, - #EmceePTSampler, - #MCMCSampler, + # EmceePTSampler, + # MCMCSampler, )} @@ -48,7 +49,7 @@ def load_from_config(cp, model, **kwargs): \**kwargs : All other keyword arguments are passed directly to the sampler's ``from_config`` file. - + Returns ------- sampler : diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 86971f0..41bc2b0 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -44,6 +44,7 @@ # ============================================================================= # + class BaseSampler(object): """Base container class for inference samplers. 
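The samplers dict and load_from_config above suggest a simple name-based dispatch: read which sampler the config file asks for, then hand everything to that class's from_config. The body of load_from_config is not shown in this hunk, so the section and option names below are assumptions, and DummySampler is invented purely to make the sketch self-contained:

    try:
        from ConfigParser import ConfigParser   # python 2
    except ImportError:
        from configparser import ConfigParser   # python 3

    class DummySampler(object):
        name = 'dummy'

        @classmethod
        def from_config(cls, cp, model, **kwargs):
            return cls()

    registry = {cls.name: cls for cls in (DummySampler,)}

    def load_from_config(cp, model, **kwargs):
        # read which sampler to use from a [sampler] section, then delegate
        name = cp.get('sampler', 'name')
        return registry[name].from_config(cp, model, **kwargs)

    cp = ConfigParser()
    cp.add_section('sampler')
    cp.set('sampler', 'name', 'dummy')
    sampler = load_from_config(cp, model=None)

Keeping the registry as a plain dict means a new sampler only has to define a unique name attribute and add itself to the tuple to become loadable from a config file.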
@@ -171,7 +172,7 @@ def setup_output(self, output_file, force=False, injection_file=None): backup_file) # Create a new file if the checkpoint doesn't exist, or if it is # corrupted - self.new_checkpoint = False # keeps track if this is a new file or not + self.new_checkpoint = False # keeps track if this is a new file or not if not checkpoint_valid: logging.info("Checkpoint not found or not valid") create_new_output_file(self, checkpoint_file, force=force, diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index a87fa1e..d5afbc0 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -134,8 +134,8 @@ class BaseMCMC(object): """ __metaclass__ = ABCMeta - _lastclear = None # the iteration when samples were cleared from memory - _itercounter = None # the number of iterations since the last clear + _lastclear = None # the iteration when samples were cleared from memory + _itercounter = None # the number of iterations since the last clear _pos = None _p0 = None _nwalkers = None @@ -340,7 +340,6 @@ def run(self): # run the underlying sampler for the desired interval self.run_mcmc(iterinterval) # update the itercounter - #startiter = startiter + iterinterval self._itercounter = self._itercounter + iterinterval # dump the current results self.checkpoint() From 0a6f82dc450064ead84fd924420ad42ae57d23e2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 21:09:14 +0200 Subject: [PATCH 46/47] fix bugs for running with data --- gwin/io/base_hdf.py | 1 + gwin/io/base_mcmc.py | 5 ++++- gwin/models/base.py | 8 +++++++- gwin/models/base_data.py | 8 +++++++- gwin/models/gaussian_noise.py | 18 ++++++++++++++---- 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 1b3961a..8a1665c 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -415,6 +415,7 @@ def write_psd(self, psds, group=None): if group is None: group = subgroup else: + print group, subgroup group = '/'.join([group, subgroup]) for ifo in psds: self[group.format(ifo=ifo)] = psds[ifo] diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 7e1c3b4..f77247f 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -103,7 +103,8 @@ def write_samples(self, samples, parameters=None, istop = istart + niterations self.create_dataset(dataset_name, (nwalkers, istop), maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) + dtype=samples[param].dtype, + fletcher32=True) self[dataset_name][:, istart:istop] = samples[param] def read_raw_samples(self, fields, @@ -122,6 +123,8 @@ def read_raw_samples(self, fields, dict A dictionary of field name -> numpy array pairs. """ + if isinstance(fields, (str, unicode)): + fields = [fields] # walkers to load if walkers is not None: widx = numpy.zeros(fp.nwalkers, dtype=bool) diff --git a/gwin/models/base.py b/gwin/models/base.py index 9c4598c..a75c881 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -745,7 +745,13 @@ def from_config(cls, cp, **kwargs): return cls(**args) def write_metadata(self, fp): - """Writes metadata to the given file handler.""" + """Writes metadata to the given file handler. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. 
+ """ fp.attrs['model'] = self.name fp.attrs['variable_params'] = list(self.variable_params) fp.attrs['sampling_params'] = list(self.sampling_params) diff --git a/gwin/models/base_data.py b/gwin/models/base_data.py index b15327f..ebb5723 100644 --- a/gwin/models/base_data.py +++ b/gwin/models/base_data.py @@ -238,6 +238,12 @@ def from_config(cls, cp, data, delta_f=None, delta_t=None, return cls(**args) def write_metadata(self, fp): - """Adds data to the metadata that's written.""" + """Adds data to the metadata that's written. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. + """ super(BaseDataModel, self).write_metadata(fp) fp.write_stilde(self.data) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index 645dbb1..81dfb4e 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -441,12 +441,22 @@ def write_metadata(self, fp): """Adds writing the psds and lognl, since it's a constant. The lognl is written to the sample group's ``attrs``. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. """ - super(GaussianNoise, self).write_data(fp) - self.attrs['f_lower'] = self._f_lower + super(GaussianNoise, self).write_metadata(fp) + fp.attrs['f_lower'] = self._f_lower if self._psds is not None: - fp.write_psd(self, self._psds) - attrs = fp[fp.samples_group].attrs + fp.write_psd(self._psds) + try: + attrs = fp[fp.samples_group].attrs + except KeyError: + # group doesn't exist, create it + fp.create_group(fp.samples_group) + attrs = fp[fp.samples_group].attrs attrs['lognl'] = self.lognl for det in self.detectors: attrs['{}_lognl'.format(det)] = self.det_lognl(det) From 370613ee36e6c55515da98f63fa85f9fb075834e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 21:11:52 +0200 Subject: [PATCH 47/47] whitespace --- gwin/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwin/models/base.py b/gwin/models/base.py index a75c881..e15dc57 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -746,7 +746,7 @@ def from_config(cls, cp, **kwargs): def write_metadata(self, fp): """Writes metadata to the given file handler. - + Parameters ---------- fp : gwin.io.BaseInferenceFile instance