From 6eaa748aa980622439d5fb69de0eefdb0d2e3440 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 17 Jul 2018 21:50:29 -0400 Subject: [PATCH 01/47] start changing the base sampler api --- gwin/sampler/base.py | 885 ++++--------------------------------------- 1 file changed, 66 insertions(+), 819 deletions(-) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 3601c5b..a7a8fad 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -22,10 +22,10 @@ # ============================================================================= # """ -This modules provides classes and functions for using different sampler -packages for parameter estimation. +Defines the base sampler class to be inherited by all samplers. """ +from abc import ABCMeta import numpy from pycbc.io import FieldArray from pycbc.filter import autocorrelation @@ -41,28 +41,26 @@ # ============================================================================= # -class _BaseSampler(object): - """Base container class for running the inference sampler that will - generate the posterior distributions. +class BaseSampler(object): + """Base container class for inference samplers. Parameters ---------- model : Model An instance of a model from ``gwin.models``. """ + __metaclass__ = ABCMeta name = None def __init__(self, model): self.model = model - self.lastclear = 0 - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """This function create an instance of this sampler from the given - command-line options. + #@classmethod # uncomment when we move to python 3.3 + @abstractmethod + def from_config(cls, cp, model, pool=None, model_call=None, **kwargs): + """This should initialize the sampler given a config file. """ - raise NotImplementedError("from_cli function not set") + pass @property def variable_params(self): @@ -72,841 +70,90 @@ def variable_params(self): @property def sampling_params(self): - """Returns the sampling args used by the model. + """Returns the sampling params used by the model. """ return self.model.sampling_params @property - def chain(self): - """This function should return the past samples as a - [additional dimensions x] niterations x ndim array, where ndim are the - number of model params, niterations the number of iterations, and - additional dimeionions are any additional dimensions used by the - sampler (e.g, walkers, temperatures). + def static_params(self): + """Returns the model's fixed parameters. """ - return NotImplementedError("chain function not set.") + return self.model.static_params - @property + @abstractproperty def samples(self): - """This function should return the past samples as a [additional - dimensions x] niterations field array, where the fields are union - of the sampling args and the model params. - """ - return NotImplementedError("samples function not set.") - - @property - def clear_chain(self): - """This function should clear the current chain of samples from memory. - """ - return NotImplementedError("clear chain function not set.") - - @property - def niterations(self): - """Get the current number of iterations.""" - return self.chain.shape[-2] + self.lastclear - - @property - def acceptance_fraction(self): - """This function should return the fraction of steps accepted by each - walker as an array. + """Should return all of the samples currently stored in memory as a + numpy structure array or FieldArray. 
""" - return NotImplementedError("acceptance_fraction function not set.") + pass - @property - def lnpost(self): - """This function should return the natural logarithm of the likelihood - function used by the sampler as an - [additional dimensions] x niterations array. - """ - return NotImplementedError("lnpost function not set.") - - @property + @abstractproperty def model_stats(self): - """This function should return the prior and likelihood ratio of - samples as an [additional dimensions] x niterations - array. If the model did not return that info to the - sampler, it should return None. + """Should return all of the model's metadata currently stored in + memory as a numpy structure array or FieldArray. """ - return NotImplementedError("model stats not set") + pass - def burn_in(self, initial_values): - """This function should burn in the sampler. - """ - raise NotImplementedError("This sampler has no burn_in function.") - - def run(self, niterations): + @abstractmethod + def run(self): """This function should run the sampler. + + Any checkpointing should be done internally in this function. """ - raise NotImplementedError("run function not set.") - - @classmethod - def calculate_logevidence(cls, fp): - """This function should calculate the log evidence and its error using - the results in the given file. If the sampler does not support evidence - calculation, then this will raise a NotImplementedError. - """ - raise NotImplementedError("this sampler does not support evidence " - "calculation") - - # write and read functions - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - fp.attrs['sampler'] = self.name - fp.attrs['model'] = self.model.name - fp.attrs['variable_params'] = list(self.variable_params) - fp.attrs['sampling_params'] = list(self.sampling_params) - fp.attrs["niterations"] = self.niterations - try: - fp.attrs["lognl"] = self.model.lognl - except AttributeError: - pass - for arg, val in kwargs.items(): - if val is None: - val = str(None) - if isinstance(val, dict): - fp.attrs[arg] = val.keys() - for key, item in val.items(): - if item is None: - item = str(None) - fp.attrs[key] = item - else: - fp.attrs[arg] = val - - @staticmethod - def write_logevidence(fp, lnz, dlnz): - """Writes the given log evidence and its error to the given file. - Results are saved to the file's 'log_evidence' and 'dlog_evidence' - attributes. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - lnz : float - The log of the evidence. - dlnz : float - The error in the estimate of the log evidence. - """ - fp.attrs['log_evidence'] = lnz - fp.attrs['dlog_evidence'] = dlnz - - @staticmethod - def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): - """Writes the burn in iterations to the given file. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - burn_in_iterations : array - Array of values giving the iteration of the burn in of each walker. - is_burned_in : array - Array of booleans indicating which chains are burned in. 
- """ - try: - fp['burn_in_iterations'][:] = burn_in_iterations - except KeyError: - fp['burn_in_iterations'] = burn_in_iterations - fp.attrs['burn_in_iterations'] = burn_in_iterations.max() - if is_burned_in is not None: - try: - fp['is_burned_in'][:] = is_burned_in - except KeyError: - fp['is_burned_in'] = is_burned_in - fp.attrs['is_burned_in'] = is_burned_in.all() - - @staticmethod - def write_state(fp): - """Saves the state of the sampler in a file. - """ - fp.write_random_state() - - @staticmethod - def set_state_from_file(fp): - """Sets the state of the sampler back to the instance saved in a file. - """ - numpy.random.set_state(fp.read_random_state()) - - -class BaseMCMCSampler(_BaseSampler): - """This class is used to construct the MCMC sampler from the kombine-like - packages. - - Parameters - ---------- - sampler : sampler instance - An instance of an MCMC sampler similar to kombine or emcee. - model : model class - A model from ``gwin.models``. - - Attributes - ---------- - sampler : - The MCMC sampler instance used. - p0 : nwalkers x ndim array - The initial position of the walkers. Set by using set_p0. If not set - yet, a ValueError is raised when the attribute is accessed. - pos : {None, array} - An array of the current walker positions. - """ - name = None - - def __init__(self, sampler, model): - self._sampler = sampler - self._pos = None - self._p0 = None - self._currentblob = None - self._nwalkers = None - self.lastclear = 0 - self.burn_in_iterations = None - # initialize - super(BaseMCMCSampler, self).__init__(model) - - @property - def sampler(self): - return self._sampler - - @property - def pos(self): - return self._pos - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An nwalkers x ndim array of the initial positions that were set. - """ - # create a (nwalker, ndim) array for initial positions - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1) - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms(samples) - # draw random samples if samples are not provided - else: - samples = self.model.prior_rvs(size=nwalkers, prior=prior) - # convert to 2D array - for i, param in enumerate(self.sampling_params): - p0[:, i] = samples[param] - self._p0 = p0 - return p0 - - @property - def p0(self): - if self._p0 is None: - raise ValueError("initial positions not set; run set_p0") - return self._p0 - - @property - def nwalkers(self): - """Get the number of walkers.""" - return self._nwalkers - - @property - def acceptance_fraction(self): - """Get the fraction of steps accepted by each walker as an array. - """ - return self._sampler.acceptance_fraction - - @property - def samples(self): - """Returns the samples in the chain as a FieldArray. - - If the sampling args are not the same as the model params, the - returned samples will have both the sampling and the model params. - - The returned FieldArray has dimension [additional dimensions x] - nwalkers x niterations. 
- """ - # chain is a [additional dimensions x] niterations x ndim array - samples = self.chain - sampling_params = self.sampling_params - # convert to dictionary to apply boundary conditions - samples = {param: samples[..., ii] for - ii, param in enumerate(sampling_params)} - samples = self.model.prior_distribution.apply_boundary_conditions( - **samples) - # now convert to field array - samples = FieldArray.from_arrays([samples[param] - for param in sampling_params], - names=sampling_params) - # apply transforms to go to model params space - if self.model.sampling_transforms is not None: - samples = self.model.sampling_transforms.apply(samples, - inverse=True) - return samples + pass - @property - def model_stats(self): - """Returns the model stats as a FieldArray, with field names - corresponding to the type of data returned by the model. - The returned array has shape nwalkers x niterations. If no additional - stats were returned to the sampler by the model, returns - None. - """ - stats = numpy.array(self._sampler.blobs) - if stats.size == 0: - return None - # we'll force arrays to float; this way, if there are `None`s in the - # blobs, they will be changed to `nan`s - arrays = {field: stats[..., fi].astype(float) - for fi, field in - enumerate(self.model.default_stats)} - return FieldArray.from_kwargs(**arrays).transpose() + @abstractmethod + def write_samples(cls, fp, samples, group="samples", **kwargs): + """This should write all of the provided samples to the given hdf file. - # write and read functions - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. + This function should be used to write both samples and model stats. Parameters ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword args are written to the file's ``attrs``. - """ - super(BaseMCMCSampler, self).write_metadata(fp, **kwargs) - # add info about walkers, burn in - fp.attrs["nwalkers"] = self.nwalkers - - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a model params. The samples are - written as an ``nwalkers x niterations`` array. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. + fp : open hdf file + The file to write to. + samples : structure array-like + Samples should be provided as a numpy structure array or a + FieldArray (basically, anything for which ``samples['param']`` will + return a numpy array). + group : str, optional + The group in ``fp`` to write the ``samples`` to. Default is + "samples". 
+ \**kwargs : + Any other keyword args the sampler needs to write data. """ - nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=1) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, " - "but dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (nwalkers, istop), - maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, istart:istop] = samples[param] - - def write_chain(self, fp, start_iteration=None, max_iterations=None): - """Writes the samples from the current chain to the given file. - - Results are written to: + pass - `fp[fp.samples_group/{field}/(temp{k}/)walker{i}]`, + @abstractmethod + def read_samples(cls, fp, parameters, group="samples", **kwargs): + """This should read the requested parameters from the given hdf file. - where `{i}` is the index of a walker, `{field}` is the name of each - field returned by ``model_stats``, and, if the sampler is - multitempered, `{k}` is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - samples_group : str - Name of samples group to write. - """ - # samples is a nwalkers x niterations field array - samples = self.samples - parameters = self.variable_params - samples_group = fp.samples_group - # write data - self.write_samples_group(fp, samples_group, parameters, samples, - start_iteration=start_iteration, - max_iterations=max_iterations) - - def write_model_stats(self, fp, start_iteration=None, - max_iterations=None): - """Writes the ``model_stats`` to the given file. - - Results are written to: - - `fp[fp.stats_group/{field}/(temp{k}/)walker{i}]`, - - where `{i}` is the index of a walker, `{field}` is the name of each - field returned by ``model_stats``, and, if the sampler is - multitempered, `{k}` is the temperature. If nothing is returned by - ``model_stats``, this does nothing. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - - Returns - ------- - stats : {FieldArray, None} - The stats that were written, as a FieldArray. 
If there were no - stats, returns None. - """ - samples = self.model_stats - if samples is None: - return None - # ensure the prior is in the model params parameter space - if 'logjacobian' in samples.fieldnames: - samples['logprior'] -= samples['logjacobian'] - parameters = samples.fieldnames - samples_group = fp.stats_group - # write data - self.write_samples_group(fp, samples_group, parameters, samples, - start_iteration=start_iteration, - max_iterations=max_iterations) - return samples - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction]`. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - dataset_name = "acceptance_fraction" - try: - fp[dataset_name][:] = self.acceptance_fraction - except KeyError: - # dataset doesn't exist yet, create it - fp[dataset_name] = self.acceptance_fraction - - def write_results(self, fp, start_iteration=None, - max_iterations=None, **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. Also computes and writes the autocorrleation lengths - of the chains. See the various write function for details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the acceptance fraction has not previously been - written to the file. The default (None) is to use the maximum size - allowed by h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. 
- """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - else: - widx = slice(0, None) - # get the slice to use - if iteration is not None: - get_index = iteration - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][widx, get_index] - if flatten: - arr = arr.flatten() - arrays[name] = arr - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested walkers - x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields(fp, samples_group, loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration, walkers=walkers, - flatten=flatten) - - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in a file. - - The number of independent samples are counted starting from after - burn-in. 
If the sampler hasn't burned in yet, then 0 is returned. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read. - - Returns - ------- - int - The number of independent samples. - """ - # check if burned in - if not fp.is_burned_in: - return 0 - # we'll just read a single parameter from the file - samples = cls.read_samples(fp, fp.variable_params[0]) - return samples.size - - @staticmethod - def read_acceptance_fraction(fp, walkers=None): - """Reads the acceptance fraction from the given file. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested walkers,). - """ - group = 'acceptance_fraction' - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) - else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - return fp[group][wmask] - - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None): - """Computes the autocorrleation function of the model params in the - given file. - - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain. An - ACF per-walker will be returned instead if ``per_walker=True``. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``nwalkers x niterations``. - """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - for param in parameters: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] - for ii in walkers] - acfs[param] = numpy.vstack(arrays) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - walkers=walkers, - flatten=False)[param] - samples = samples.mean(axis=0) - acfs[param] = autocorrelation.calculate_acf(samples).numpy() - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params in the - given file. - - Parameter values are averaged over all walkers at each iteration. 
- The ACL is then calculated over the averaged chain. If the returned ACL - is `inf`, will default to the number of current iterations. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary giving the ACL for each parameter. - """ - acls = {} - for param in fp.variable_params: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - flatten=False)[param] - samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - acls[param] = acl - return acls - - @staticmethod - def write_acls(fp, acls): - """Writes the given autocorrelation lengths to the given file. - - The ACL of each parameter is saved to ``fp['acls/{param}']``. - The maximum over all the parameters is saved to the file's 'acl' - attribute. + The samples should be returned as a ``FieldArray``. Parameters ---------- - fp : InferenceFile - An open file handler to write the samples to. - acls : dict - A dictionary of ACLs keyed by the parameter. - - Returns - ------- - ACL - The maximum of the acls that was written to the file. + fp : open hdf file + The file to read from. + parameters : list of str + List of the parameters to return. May include functions. + group : str, optional + The group in ``fp`` to read the ``samples`` from. Default is + "samples". + \**kwargs : + Any other keyword args the sampler needs to read data. """ - group = 'acls/{}' - # write the individual acls - for param in acls: - try: - # we need to use the write_direct function because it's - # apparently the only way to update scalars in h5py - fp[group.format(param)].write_direct(numpy.array(acls[param])) - except KeyError: - # dataset doesn't exist yet - fp[group.format(param)] = acls[param] - # write the maximum over all params - fp.attrs['acl'] = numpy.array(acls.values()).max() - return fp.attrs['acl'] + pass - @staticmethod - def read_acls(fp): - """Reads the acls of all the parameters in the given file. + @abstractmethod + def write_posterior(cls, posterior_fp, **kwargs): + """This should write a posterior plus any other metadata to the given + file. Parameters ---------- - fp : InferenceFile - An open file handler to read the acls from. - - Returns - ------- - dict - A dictionary of the ACLs, keyed by the parameter name. + posterior_fp : open hdf file + The file to write to. + \**kwargs : + Any other keyword args the sampler needs to write the posterior. """ - group = fp['acls'] - return {param: group[param].value for param in group.keys()} + pass From d41964dbccef15130d20fe1c498f0f1e4c7259eb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 18:50:37 +0200 Subject: [PATCH 02/47] start InferenceFile -> BaseInferenceFile --- gwin/io/hdf.py | 637 +++++++++++++++++++------------------------------ 1 file changed, 247 insertions(+), 390 deletions(-) diff --git a/gwin/io/hdf.py b/gwin/io/hdf.py index 1799694..a89172e 100644 --- a/gwin/io/hdf.py +++ b/gwin/io/hdf.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016 Christopher M. Biwer +# Copyright (C) 2016 Christopher M. 
Biwer, Collin Capano # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 3 of the License, or (at your @@ -28,6 +28,7 @@ import os import sys import logging +from abc import ABCMeta import numpy @@ -41,64 +42,7 @@ from .. import sampler as gwin_sampler -class _PosteriorOnlyParser(object): - """Provides interface for reading/writing samples from/to an InferenceFile - that contains flattened posterior samples. - """ - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None): - """Reads fields from the given file. - """ - if iteration is not None: - get_index = iteration - else: - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # load - arrays = {} - group = fields_group + '/{}' - arrays = {field: fp[group.format(field)][get_index] - for field in fields} - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, samples_group=None, - thin_start=0, thin_end=None, thin_interval=1, - iteration=None, array_class=None): - """Reads posterior samples from a posterior-only file. - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields(fp, samples_group, loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration) - - @staticmethod - def write_samples_group(fp, samples_group, fields, samples): - """Writes the given samples to the given samples group. - """ - for field in samples.fieldnames: - grp = '{}/{}'.format(samples_group, field) - fp[grp] = samples[field] - - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in the file. - """ - return cls.read_samples(fp, fp.variable_params[0]).size - - -class InferenceFile(h5py.File): +class BaseInferenceFile(h5py.File): """ A subclass of the h5py.File object that has extra functions for handling reading and writing the samples from the samplers. @@ -109,139 +53,105 @@ class InferenceFile(h5py.File): mode : {None, str} The mode to open the file, eg. "w" for write and "r" for read. """ - name = "hdf" + __metaclass__ = ABCMeta + + name = None samples_group = 'samples' - stats_group = 'model_stats' - sampler_group = 'sampler_states' + sampler_group = 'sampler_info' + data_group = 'data' + injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): super(InferenceFile, self).__init__(path, mode, **kwargs) - @property - def posterior_only(self): - """Whether the file only contains flattened posterior samples. + def __getattr__(self, attr): + """Things stored in ``.attrs`` are promoted to instance attributes. + + Note that properties will be called before this, so if there are any + properties that share the same name as something in ``.attrs``, that + property will get returned. 
""" - try: - return self.attrs['posterior_only'] - except KeyError: - return False - - @property - def sampler_name(self): - """Returns the name of the sampler that was used.""" - return self.attrs["sampler"] - - @property - def sampler_class(self): - """Returns the sampler class that was used.""" - try: - sampler = self.sampler_name - except KeyError: - return None - return gwin_sampler.samplers[sampler] - - @property - def samples_parser(self): - """Returns the class to use to read/write samples from/to the file.""" - if self.posterior_only: - return _PosteriorOnlyParser - else: - return self.sampler_class - - @property - def model_name(self): - """Returns the name of the model that was used.""" - return self.attrs["model"] + return self.attrs[attr] - @property - def variable_params(self): - """Returns list of variable_params. + @abstractmethod + def write_samples(self, samples, **kwargs): + """This should write all of the provided samples. - Returns - ------- - variable_params : {list, str} - List of str that contain variable_params keys. - """ - return self.attrs["variable_params"] + This function should be used to write both samples and model stats. - @property - def static_params(self): - """Returns a dictionary of the static_params. The keys are the argument - names, values are the value they were set to. + Parameters + ---------- + fp : open hdf file + The file to write to. + samples : structure array-like + Samples should be provided as a numpy structure array or a + FieldArray (basically, anything for which ``samples['param']`` will + return a numpy array). + \**kwargs : + Any other keyword args the sampler needs to write data. """ - return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} + pass - @property - def sampling_params(self): - """Returns the parameters that were used to sample. + @abstractmethod + def read_samples(self, parameters, **kwargs): + """This should read the requested parameters. - Returns - ------- - sampling_params : {list, str} - List of the sampling params. - """ - return self.attrs["sampling_params"] - - @property - def lognl(self): - """Returns the log noise likelihood.""" - return self.attrs["lognl"] + The samples should be returned as a ``FieldArray``. - @property - def niterations(self): - """Returns number of iterations performed. + Parameters + ---------- + fp : open hdf file + The file to read from. + parameters : list of str + List of the parameters to return. May include functions. + \**kwargs : + Any other keyword args the sampler needs to read data. Returns ------- - niterations : int - Number of iterations performed. + FieldArray : + The samples as a FieldArray. """ - return self.attrs["niterations"] + pass - @property - def n_independent_samples(self): - """Returns the number of independent samples stored in the file. - """ - return self.samples_parser.n_independent_samples(self) + @abstractmethod + def write_posterior(self, posterior_fp, **kwargs): + """This should write a posterior plus any other metadata to the given + file. - @property - def burn_in_iterations(self): - """Returns number of iterations in the burn in. + Parameters + ---------- + posterior_fp : open hdf file + The file to write to. + \**kwargs : + Any other keyword args the sampler needs to write the posterior. """ - return self.attrs["burn_in_iterations"] + pass @property - def is_burned_in(self): - """Returns whether or not the sampler is burned in. 
- """ - return self.attrs["is_burned_in"] + def sampler_class(self): + """Returns the sampler class that was used.""" + try: + sampler = self.sampler_name + except KeyError: + return None + return gwin_sampler.samplers[sampler] @property - def nwalkers(self): - """Returns number of walkers used. - - Returns - ------- - nwalkesr : int - Number of walkers used. + def static_params(self): + """Returns a dictionary of the static_params. The keys are the argument + names, values are the value they were set to. """ - return self.attrs["nwalkers"] - - @property - def ntemps(self): - """Returns number of temperatures used.""" - return self.attrs["ntemps"] + return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} @property - def acl(self): - """ Returns the saved autocorelation length (ACL). - - Returns - ------- - acl : {int, float} - The ACL. + def n_independent_samples(self): + """Returns the number of independent samples stored in the file. """ - return self.attrs["acl"] + try: + return self.attrs['n_independent_samples'] + except KeyError: + return 0 @property def cmd(self): @@ -260,21 +170,54 @@ def cmd(self): cmd = cmd[-1] return cmd - @property - def resume_points(self): - """The iterations at which a run was resumed from checkpoint. + def write_metadata(self, sampler, **kwargs): + """Writes the sampler's metadata. - Returns - ------- - resume_points : array or None - An array of integers giving the points at which the run resumed. + Parameters + ---------- + sampler : gwin.sampler + An instance of a gwin sampler. + **kwargs : + All keyword arguments are saved as separate arguments in the + file attrs. If any keyword argument is a dictionary, the keyword + will point to the list of keys in the the file's ``attrs``. Each + key is then stored as a separate attr with its corresponding value. + """ + self.attrs['sampler'] = samlper.name + self.attrs['model'] = sampler.model.name + self.attrs['variable_params'] = list(sampler.variable_params) + self.attrs['sampling_params'] = list(sampler.sampling_params) + # FIXME: what will write this? + #fp.attrs["lognl"] = self.model.lognl + # add the static params to the kwargs + kwargs['static_params'] = sampler.static_params + for arg, val in kwargs.items(): + if val is None: + val = str(None) + if isinstance(val, dict): + self.attrs[arg] = val.keys() + for key, item in val.items(): + if item is None: + item = str(None) + self.attrs[key] = item + else: + self.attrs[arg] = val + + def write_logevidence(self, lnz, dlnz): + """Writes the given log evidence and its error. - Raises - ------ - KeyError - If the run never resumed from a checkpoint. + Results are saved to file's 'log_evidence' and 'dlog_evidence' + attributes. + + Parameters + ---------- + lnz : float + The log of the evidence. + dlnz : float + The error in the estimate of the log evidence. """ - return self.attrs['resume_points'] + self.attrs['log_evidence'] = lnz + self.attrs['dlog_evidence'] = dlnz @property def log_evidence(self): @@ -283,115 +226,37 @@ def log_evidence(self): """ return self.attrs["log_evidence"], self.attrs["dlog_evidence"] - def read_samples(self, parameters, samples_group=None, **kwargs): - """Reads samples from the file. - - Parameters - ----------- - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `samples_group`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. 
- samples_group : str - Group in HDF InferenceFile that parameters belong to. - **kwargs : - The rest of the keyword args are passed to the sampler's - `read_samples` method. - - Returns - ------- - FieldArray - Samples for the given parameters, as an instance of a - FieldArray. - """ - # get the appropriate sampler class - samples_group = samples_group if samples_group else self.samples_group - return self.samples_parser.read_samples(self, parameters, - samples_group=samples_group, - **kwargs) - - def read_model_stats(self, **kwargs): - """Reads model stats from self. - - Parameters - ----------- - **kwargs : - The keyword args are passed to the sampler's - ``read_model_stats`` method. - - Returns - ------- - stats : {FieldArray, None} - Likelihood stats in the file, as a FieldArray. The fields of the - array are the names of the stats that are in the ``model_stats`` - group. - """ - parameters = self[self.stats_group].keys() - return self.read_samples(parameters, samples_group=self.stats_group, - **kwargs) + def write_random_state(self, group=None, state=None): + """Writes the state of the random number generator from the file. - def read_acceptance_fraction(self, **kwargs): - """Returns the acceptance fraction that was written to the file. + The random state is written to ``sampler_group``/random_state. Parameters ---------- - **kwargs : - All keyword arguments are passed to the sampler's - `read_acceptance_fraction` function. - Returns - ------- - numpy.array - The acceptance fraction. - """ - return self.sampler_class.read_acceptance_fraction(self, **kwargs) - - def read_acls(self): - """Returns all of the individual chains' acls. See the `read_acls` - function of this file's sampler for more details. - """ - return self.sampler_class.read_acls(self) - - def read_label(self, parameter, error_on_none=False): - """Returns the label for the parameter. - - Parameters - ----------- - parameter : str - Name of parameter to get a label for. Will first try to retrieve - a label from this file's "label" attributes. If the parameter - is not found there, will look for a label from - pycbc.waveform.parameters. - error_on_none : {False, bool} - If True, will raise a ValueError if a label cannot be found, or if - the label is None. Otherwise, the parameter will just be returned - if no label can be found. - - Returns - ------- - label : str - A formatted string for the name of the paramter. + group : str + Name of group to write random state to. + state : tuple, optional + Specify the random state to write. If None, will use + ``numpy.random.get_state()``. 
""" - # get label - try: - label = self[parameter].attrs["label"] - except KeyError: - # try looking in pycbc.waveform.parameters - try: - label = getattr(wfparams, parameter).label - except AttributeError: - label = None - if label is None: - if error_on_none: - raise ValueError("Cannot find a label for paramter %s" % ( - parameter)) - else: - return parameter - return label + group = self.sampler_group if group is None else group + dataset_name = "/".join([group, "random_state"]) + if state is None: + state = numpy.random.get_state() + s, arr, pos, has_gauss, cached_gauss = state + if group in self: + self[dataset_name][:] = arr + else: + self.create_dataset(dataset_name, arr.shape, fletcher32=True, + dtype=arr.dtype) + self[dataset_name][:] = arr + self[dataset_name].attrs["s"] = s + self[dataset_name].attrs["pos"] = pos + self[dataset_name].attrs["has_gauss"] = has_gauss + self[dataset_name].attrs["cached_gauss"] = cached_gauss def read_random_state(self, group=None): - """ Reads the state of the random number generator from the file. + """Reads the state of the random number generator from the file. Parameters ---------- @@ -412,6 +277,11 @@ def read_random_state(self, group=None): cached_gauss = self[dataset_name].attrs["cached_gauss"] return s, arr, pos, has_gauss, cached_gauss + def load_random_state(self): + """Sets numpy's random state using what is saved in the file. + """ + numpy.random.set_state(self.read_random_state()) + def write_strain(self, strain_dict, group=None): """Writes strain for each IFO to file. @@ -423,7 +293,7 @@ def write_strain(self, strain_dict, group=None): The group to write the strain to. If None, will write to the top level. """ - subgroup = "{ifo}/strain" + subgroup = self.data_group + "/{ifo}/strain" if group is None: group = subgroup else: @@ -445,7 +315,7 @@ def write_stilde(self, stilde_dict, group=None): The group to write the strain to. If None, will write to the top level. """ - subgroup = "{ifo}/stilde" + subgroup = self.data_group + "/{ifo}/stilde" if group is None: group = subgroup else: @@ -469,7 +339,7 @@ def write_psd(self, psds, low_frequency_cutoff, group=None): The group to write the strain to. If None, will write to the top level. """ - subgroup = "{ifo}/psds/0" + subgroup = self.data_group + "/{ifo}/psds/0" if group is None: group = subgroup else: @@ -522,24 +392,19 @@ def write_data(self, strain_dict=None, stilde_dict=None, if strain_dict is not None: self.write_strain(strain_dict, group=group) - def write_injections(self, injection_file, ifo): - """ Writes injection parameters for an IFO to file. + def write_injections(self, injection_file): + """Writes injection parameters from the given injection file. + + Everything in the injection file is copied to ``injections_group``. Parameters ---------- injection_file : str Path to HDF injection file. - ifo : str - IFO name. 
""" - subgroup = "{ifo}/injections" - self.create_group(subgroup.format(ifo=ifo)) try: with h5py.File(injection_file, "r") as fp: - for param in fp.keys(): - self[subgroup.format(ifo=ifo)][param] = fp[param][:] - for key in fp.attrs.keys(): - self[subgroup.format(ifo=ifo)].attrs[key] = fp.attrs[key] + super(BaseInferenceFile, self).copy(fp, self.injections_group) except IOError: logging.warn("Could not read %s as an HDF file", injection_file) @@ -563,47 +428,6 @@ def write_command_line(self): previous = [] self.attrs["cmd"] = cmd + previous - def write_resume_point(self): - """Keeps a list of the number of iterations that were in a file when a - run was resumed from a checkpoint.""" - try: - resume_pts = self.attrs["resume_points"].tolist() - except KeyError: - resume_pts = [] - try: - niterations = self.niterations - except KeyError: - niterations = 0 - resume_pts.append(niterations) - self.attrs["resume_points"] = resume_pts - - def write_random_state(self, group=None, state=None): - """ Writes the state of the random number generator from the file. - - Parameters - ---------- - group : str - Name of group to read random state to. - state : tuple, optional - Specify the random state to write. If None, will use - ``numpy.random.get_state()``. - """ - group = self.sampler_group if group is None else group - dataset_name = "/".join([group, "random_state"]) - if state is None: - state = numpy.random.get_state() - s, arr, pos, has_gauss, cached_gauss = state - if group in self: - self[dataset_name][:] = arr - else: - self.create_dataset(dataset_name, arr.shape, fletcher32=True, - dtype=arr.dtype) - self[dataset_name][:] = arr - self[dataset_name].attrs["s"] = s - self[dataset_name].attrs["pos"] = pos - self[dataset_name].attrs["has_gauss"] = has_gauss - self[dataset_name].attrs["cached_gauss"] = cached_gauss - def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): """Formats a slice using the given arguments that can be used to retrieve a thinned array from an InferenceFile. @@ -651,8 +475,7 @@ def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): def copy_metadata(self, other): """Copies all metadata from this file to the other file. - Metadata is defined as all data that is not in either the samples or - stats group. + Metadata is defined as everything in the top-level ``.attrs``. Parameters ---------- @@ -660,70 +483,60 @@ def copy_metadata(self, other): An open inference file to write the data to. """ logging.info("Copying metadata") - # copy non-samples/stats data - for key in self.keys(): - if key not in [self.samples_group, self.stats_group]: - super(InferenceFile, self).copy(key, other) # copy attributes for key in self.attrs.keys(): other.attrs[key] = self.attrs[key] - def copy(self, other, parameters=None, parameter_names=None, - posterior_only=False, **kwargs): - """Copies data in this file to another file. + def copy_info(self, other, ignore=None): + """Copies "info" from this file to the other. - The samples and stats to copy may be down selected using the given - kwargs. All other data (the "metadata") are copied exactly. + "Info" is defined all groups that are not the samples group. Parameters ---------- - other : str or InferenceFile - The file to write to. May be either a string giving a filename, - or an open hdf file. If the former, the file will be opened with - the write attribute (note that if a file already exists with that - name, it will be deleted). + other : output file + The output file. Must be an hdf file. 
+ ignore : (list of) str + Don't copy the given groups. + """ + logging.info("Copying info") + # copy non-samples/stats data + if ignore is None: + ignore = [] + if isinstance(ignore, (str, unicode)): + ignore = [ignore] + ignore = set(ignore + [self.samples_group]) + copy_groups = set(self.keys()) - ignore + for key in copy_groups: + super(BaseInferenceFile, self).copy(key, other) + + def copy_samples(self, other, parameters=None, parameter_names=None, + read_args=None, write_args=None): + """Should copy samples to the other files. + + Parameters + ---------- + other : InferenceFile + An open inference file to write to. parameters : list of str, optional List of parameters to copy. If None, will copy all parameters. parameter_names : dict, optional Rename one or more parameters to the given name. The dictionary should map parameter -> parameter name. If None, will just use the original parameter names. - posterior_only : bool, optional - Write the samples and model stats as flattened arrays, and - set other's posterior_only attribute. For example, if this file - has a parameter's samples written to - `{samples_group}/{param}/walker{x}`, then other will have all of - the selected samples from all walkers written to - `{samples_group}/{param}/`. - **kwargs : - All other keyword arguments are passed to `read_samples`. - - Returns - ------- - InferenceFile - The open file handler to other. + read_args : dict, optional + Arguments to pass to ``read_samples``. + write_args : dict, optional + Arguments to pass to ``write_samples``. """ - if not isinstance(other, h5py.File): - # check that we're not trying to overwrite this file - if other == self.name: - raise IOError("destination is the same as this file") - other = InferenceFile(other, 'w') - # copy metadata over - self.copy_metadata(other) - # update other's posterior attribute - if posterior_only: - other.attrs['posterior_only'] = posterior_only # select the samples to copy logging.info("Reading samples to copy") if parameters is None: parameters = self.variable_params - # if list of desired parameters is different, rename model params + # if list of desired parameters is different, rename if set(parameters) != set(self.variable_params): other.attrs['variable_params'] = parameters - # if only the posterior is desired, we'll flatten the results - if not posterior_only and not self.posterior_only: - kwargs['flatten'] = False - samples = self.read_samples(parameters, **kwargs) + samples = self.read_samples(parameters, **read_args) logging.info("Copying {} samples".format(samples.size)) # if different parameter names are desired, get them from the samples if parameter_names: @@ -733,23 +546,67 @@ def copy(self, other, parameters=None, parameter_names=None, samples = FieldArray.from_kwargs(**arrs) other.attrs['variable_params'] = samples.fieldnames logging.info("Writing samples") - other.samples_parser.write_samples_group(other, self.samples_group, - samples.fieldnames, samples) - # do the same for the model stats - logging.info("Reading stats to copy") - stats = self.read_model_stats(**kwargs) - logging.info("Writing stats") - other.samples_parser.write_samples_group(other, self.stats_group, - stats.fieldnames, stats) + other.write_samples(other, samples, **write_args) + + def copy(self, other, ignore=None, parameters=None, parameter_names=None, + read_args=None, write_args=None): + """Copies metadata, info, and samples in this file to another file. + + Parameters + ---------- + other : str or InferenceFile + The file to write to. 
May be either a string giving a filename, + or an open hdf file. If the former, the file will be opened with + the write attribute (note that if a file already exists with that + name, it will be deleted). + ignore : (list of) strings + Don't copy the given groups. If the samples group is included, no + samples will be copied. + parameters : list of str, optional + List of parameters in the samples group to copy. If None, will copy + all parameters. + parameter_names : dict, optional + Rename one or more parameters to the given name. The dictionary + should map parameter -> parameter name. If None, will just use the + original parameter names. + read_args : dict, optional + Arguments to pass to ``read_samples``. + write_args : dict, optional + Arguments to pass to ``write_samples``. + + Returns + ------- + InferenceFile + The open file handler to other. + """ + if not isinstance(other, h5py.File): + # check that we're not trying to overwrite this file + if other == self.name: + raise IOError("destination is the same as this file") + other = InferenceFile(other, 'w') + # metadata + self.copy_metadata(other) + # info + if ignore is None: + ignore = [] + if isinstance(ignore, (str, unicode)): + ignore = [ignore] + self.copy_info(other, ignore=ignore) + # samples + if self.samples_group not in ignore: + self.copy_samples(other, parameters=parameters, + parameter_names=parameter_names, + read_args=read_args, + write_args=write_args) # if any down selection was done, re-set the burn in iterations and # the acl, and the niterations. # The last dimension of the samples returned by the sampler should # be the number of iterations. - if samples.shape[-1] != self.niterations: - other.attrs['acl'] = 1 - other.attrs['burn_in_iterations'] = 0 - other.attrs['niterations'] = samples.shape[-1] - return other + #if samples.shape[-1] != self.niterations: + # other.attrs['acl'] = 1 + # other.attrs['burn_in_iterations'] = 0 + # other.attrs['niterations'] = samples.shape[-1] + #return other def check_integrity(filename): From cef9e8cb917befbf3f52238357b1851c5a0e39eb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 18:51:28 +0200 Subject: [PATCH 03/47] rename hdf.py base_hdf.py --- gwin/io/{hdf.py => base_hdf.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename gwin/io/{hdf.py => base_hdf.py} (100%) diff --git a/gwin/io/hdf.py b/gwin/io/base_hdf.py similarity index 100% rename from gwin/io/hdf.py rename to gwin/io/base_hdf.py From 69721023b254702805c420fc4671e1e8899a7339 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 19:22:36 +0200 Subject: [PATCH 04/47] add parse_parameters function --- gwin/io/base_hdf.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index a89172e..d92d3f1 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -41,7 +41,6 @@ from .. import sampler as gwin_sampler - class BaseInferenceFile(h5py.File): """ A subclass of the h5py.File object that has extra functions for handling reading and writing the samples from the samplers. @@ -92,6 +91,43 @@ def write_samples(self, samples, **kwargs): """ pass + def parse_parameters(self, parameters, array_class=None): + """Parses a parameters arg to figure out what fields need to be loaded. + + Parameters + ---------- + parameters : (list of) strings + The parameter(s) to retrieve. 
A parameter can be the name of any + field in ``samples_group``, a virtual field or method of + ``FieldArray`` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + array_class : array class, optional + The type of array to use to parse the parameters. The class must have a + ``parse_parameters`` method. Default is to use a ``FieldArray``. + + Returns + ------- + list : + A list of strings giving the fields to load from the file. + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + return array_class.parse_parameters(parameters, possible_fields) + + def _parse_parameters(self, parameters, **kwargs): + """Decorator function for read samples that calls parse parameters. + """ + array_class = kwargs.pop('array_class', None) + def dostuff(parameters, **kwargs): + parameters = self.parse_parameters(parameters, array_class) + return self.read_samples(parameters, **kwargs) + return dostuff + + @_parse_parameters @abstractmethod def read_samples(self, parameters, **kwargs): """This should read the requested parameters. From 7c7e6153838bffc3363fab11d520394c0158f8fe Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Thu, 12 Jul 2018 19:22:54 +0200 Subject: [PATCH 05/47] add module for base mcmc io --- gwin/io/base_mcmc.py | 254 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 gwin/io/base_mcmc.py diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py new file mode 100644 index 0000000..545e8e0 --- /dev/null +++ b/gwin/io/base_mcmc.py @@ -0,0 +1,254 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides I/O that is specific to MCMC samplers. +""" + +import os +import sys +import logging +from abc import ABCMeta + +import numpy + +import h5py + +from pycbc import DYN_RANGE_FAC +from pycbc.io import FieldArray +from pycbc.types import FrequencySeries +from pycbc.waveform import parameters as wfparams + +from .hdf import InferenceFile + +class EnsembleMCMCIO(obect): + + __metaclass__ = ABCMeta + + @abstractmethod + def read_acls(self): + """Should return all of the individual chains' acls. + """ + pass + + def write_mcmc_metadata(self, sampler): + """Writes metadata unique to an ensemble MCMC. + + Parameters + ---------- + sampler : gwin.sampler + An instance of a gwin sampler. 
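+
+        For example, after this is called, ``self.attrs["nwalkers"]`` will
+        equal ``sampler.nwalkers`` and ``self.attrs["niterations"]`` will
+        equal ``sampler.niterations``.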
+ """ + self.attrs["niterations"] = sampler.niterations + self.attrs["nwalkers"] = sampler.nwalkers + + def write_samples(self, parameters, samples, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to: + + ``fp[samples_group/{vararg}]``, + + where ``{vararg}`` is the name of a model params. The samples are + written as an ``nwalkers x niterations`` array. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + samples_group : str + Name of samples group to write. + parameters : list + The parameters to write to the file. + samples : FieldArray + The samples to write. Should be a FieldArray with fields containing + the samples to write and shape nwalkers x niterations. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. + """ + nwalkers, niterations = samples.shape + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = samples_group + '/{name}' + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = fp[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + fp[dataset_name].resize(istop, axis=1) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, " + "but dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + fp.create_dataset(dataset_name, (nwalkers, istop), + maxshape=(nwalkers, max_iterations), + dtype=float, fletcher32=True) + fp[dataset_name][:, istart:istop] = samples[param] + + def read_samples(self, parameters, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True, + array_class=None): + """Reads samples for the given parameter(s). + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. A parameter can be the name of any + field in `fp[fp.samples_group]`, a virtual field or method of + `FieldArray` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + thin_start : int + Index of the sample to begin returning samples. Default is to read + samples after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all samples + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + iteration : int + Get a single iteration. If provided, will override the + `thin_{start/interval/end}` arguments. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. 
+ flatten : {True, bool} + The returned array will be one dimensional, with all desired + samples from all desired walkers concatenated together. If False, + the returned array will have dimension requested walkers + x requested iterations. + samples_group : {None, str} + The group in `fp` from which to retrieve the parameter fields. If + None, searches in `fp.samples_group`. + array_class : {None, array class} + The type of array to return. The class must have a `from_kwargs` + class method and a `parse_parameters` method. If None, will return + a FieldArray. + + Returns + ------- + array_class + Samples for the given parameters, as an instance of a the given + `array_class` (`FieldArray` if `array_class` is None). + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + return self._read_fields(loadfields, array_class, + thin_start=thin_start, + thin_interval=thin_interval, thin_end=thin_end, + iteration=iteration, walkers=walkers, + flatten=flatten) + + def _read_fields(self, fields, array_class, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): + """Base function for reading samples and model stats. See + `read_samples` and `read_model_stats` for details. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + fields_group : str + The name of the group to retrieve the desired fields. + fields : list + The list of field names to retrieve. Must be names of groups in + `fp[fields_group/]`. + array_class : FieldArray or similar + The type of array to return. Must have a `from_kwargs` attribute. + + For other details on keyword arguments, see `read_samples` and + `read_model_stats`. + + Returns + ------- + array_class + An instance of the given array class populated with values + retrieved from the fields. 
+ """ + # walkers to load + if walkers is not None: + widx = numpy.zeros(fp.nwalkers, dtype=bool) + widx[walkers] = True + else: + widx = slice(0, None) + # get the slice to use + if iteration is not None: + get_index = iteration + else: + if thin_end is None: + # use the number of current iterations + thin_end = fp.niterations + get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval) + # load + arrays = {} + group = fields_group + '/{name}' + for name in fields: + arr = fp[group.format(name=name)][widx, get_index] + if flatten: + arr = arr.flatten() + arrays[name] = arr + return array_class.from_kwargs(**arrays) + + def write_resume_point(self): + """Keeps a list of the number of iterations that were in a file when a + run was resumed from a checkpoint.""" + try: + resume_pts = self.attrs["resume_points"].tolist() + except KeyError: + resume_pts = [] + try: + niterations = self.niterations + except KeyError: + niterations = 0 + resume_pts.append(niterations) + self.attrs["resume_points"] = resume_pts + From 214609a1c34329b53013644a23c4411bc13dad05 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:44:20 +0200 Subject: [PATCH 06/47] make _read_samples_data the abstract method --- gwin/io/base_hdf.py | 77 +++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index d92d3f1..6bd7864 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -28,7 +28,7 @@ import os import sys import logging -from abc import ABCMeta +from abc import ABCMeta, abstractmethod, abstractproperty import numpy @@ -42,7 +42,9 @@ from .. import sampler as gwin_sampler class BaseInferenceFile(h5py.File): - """ A subclass of the h5py.File object that has extra functions for + """Base class for all inference hdf files. + + This is a subclass of the h5py.File object. It adds functions for handling reading and writing the samples from the samplers. Parameters @@ -61,7 +63,7 @@ class BaseInferenceFile(h5py.File): injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): - super(InferenceFile, self).__init__(path, mode, **kwargs) + super(BaseInferenceFile, self).__init__(path, mode, **kwargs) def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. @@ -82,10 +84,8 @@ def write_samples(self, samples, **kwargs): ---------- fp : open hdf file The file to write to. - samples : structure array-like - Samples should be provided as a numpy structure array or a - FieldArray (basically, anything for which ``samples['param']`` will - return a numpy array). + samples : dict + Samples should be provided as a dictionary of numpy arrays. \**kwargs : Any other keyword args the sampler needs to write data. """ @@ -118,35 +118,58 @@ def parse_parameters(self, parameters, array_class=None): possible_fields = self[self.samples_group].keys() return array_class.parse_parameters(parameters, possible_fields) - def _parse_parameters(self, parameters, **kwargs): - """Decorator function for read samples that calls parse parameters. - """ - array_class = kwargs.pop('array_class', None) - def dostuff(parameters, **kwargs): - parameters = self.parse_parameters(parameters, array_class) - return self.read_samples(parameters, **kwargs) - return dostuff + def read_samples(self, parameters, array_class=None, **kwargs): + """Reads samples for the given parameter(s). 
- @_parse_parameters - @abstractmethod - def read_samples(self, parameters, **kwargs): - """This should read the requested parameters. + The ``parameters`` can be the name of any dataset in ``samples_group``, + a virtual field or method of ``FieldArray`` (as long as the file + contains the necessary fields to derive the virtual field or method), + and/or any numpy function of these. - The samples should be returned as a ``FieldArray``. + The ``parameters`` are parsed to figure out what datasets are needed. + Only those datasets will be loaded, and will be the base-level fields + of the returned ``FieldArray``. + + The ``static_params`` are also added as attributes of the returned + ``FieldArray``. Parameters - ---------- - fp : open hdf file - The file to read from. - parameters : list of str - List of the parameters to return. May include functions. + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. + array_class : FieldArray-like class, optional + The type of array to return. The class must have ``from_kwargs`` + and ``parse_parameters`` methods. If None, will return a + ``FieldArray``. \**kwargs : - Any other keyword args the sampler needs to read data. + All other keyword arguments are passed to ``_read_samples_data``. Returns ------- FieldArray : - The samples as a FieldArray. + The samples as a ``FieldArray``. + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + samples = self._read_samples_data(loadfields, **kwargs) + # convert to FieldArray + samples = array_class.from_kwargs(**samples) + # add the static params + for p,val in self.static_params.items(): + setattr(samples, p, val) + return samples + + @abstractmethod + def _read_samples_data(self, fields, **kwargs): + """Low level function for reading datasets in the samples group. + + This should return a dictionary of numpy arrays. """ pass From 9e10e08e14d2c78766d929ab8e6e60b978e97cd2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:45:34 +0200 Subject: [PATCH 07/47] added read_samples_data to base_mcmc --- gwin/io/base_mcmc.py | 115 ++++++++----------------------------------- 1 file changed, 20 insertions(+), 95 deletions(-) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 545e8e0..a597c6d 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -41,7 +41,8 @@ from .hdf import InferenceFile class EnsembleMCMCIO(obect): - + """Abstract base class that provides some IO functions for ensemble MCMCs. + """ __metaclass__ = ABCMeta @abstractmethod @@ -74,15 +75,11 @@ def write_samples(self, parameters, samples, Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. parameters : list The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. + samples : dict + The samples to write. Each array in the dictionary should have + shape nwalkers x niterations. start_iteration : int, optional Write results to the file's datasets starting at the given iteration. 
Default is to append after the last iteration in the @@ -93,11 +90,14 @@ def write_samples(self, parameters, samples, to file. The default (None) is to use the maximum size allowed by h5py. """ - nwalkers, niterations = samples.shape + nwalkers, niterations = samples.values()[0].shape + assert(all(p.shape == (nwalkers, niterations) + for p in samples.values()), + "all samples must have the same shape") if max_iterations is not None and max_iterations < niterations: raise IndexError("The provided max size is less than the " "number of iterations") - group = samples_group + '/{name}' + group = self.samples_group + '/{name}' # loop over number of dimensions for param in parameters: dataset_name = group.format(name=param) @@ -122,96 +122,21 @@ def write_samples(self, parameters, samples, dtype=float, fletcher32=True) fp[dataset_name][:, istart:istop] = samples[param] - def read_samples(self, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True, - array_class=None): - """Reads samples for the given parameter(s). + def _read_samples_data(self, fields, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): + """Base function for reading samples. Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested walkers - x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). 
- """ - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = self[self.samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return self._read_fields(loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration, walkers=walkers, - flatten=flatten) - - def _read_fields(self, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. + The list of field names to retrieve. Must be names of datasets in + the ``samples_group``. Returns ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. + dict + A dictionary of field name -> numpy array pairs. """ # walkers to load if walkers is not None: @@ -229,14 +154,14 @@ def _read_fields(self, fields, array_class, get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, thin_interval=thin_interval) # load + group = self.samples_group + '/{name}' arrays = {} - group = fields_group + '/{name}' for name in fields: arr = fp[group.format(name=name)][widx, get_index] if flatten: arr = arr.flatten() arrays[name] = arr - return array_class.from_kwargs(**arrays) + return arrays def write_resume_point(self): """Keeps a list of the number of iterations that were in a file when a From af6e7b9510222612e1e50bf6c52bca0df23e7557 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:47:41 +0200 Subject: [PATCH 08/47] add emcee file handling --- gwin/io/emcee.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 gwin/io/emcee.py diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py new file mode 100644 index 0000000..c127da5 --- /dev/null +++ b/gwin/io/emcee.py @@ -0,0 +1,75 @@ +# Copyright (C) 2018 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides IO for the emcee sampler. 
+""" + +from .base_hdf import BaseInferenceFile +from .base_mcmc import EnsembleMCMCIO + +class EmceeFile(EnsembleMCMCIO, BaseInferenceFile): + """Class to handle file IO for the ``emcee`` sampler.""" + + name = 'emcee_file' + + def read_acceptance_fraction(self, walkers=None): + """Reads the acceptance fraction from the given file. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + + Returns + ------- + array + Array of acceptance fractions with shape (requested walkers,). + """ + group = self.sampler_group + '/acceptance_fraction' + if walkers is None: + wmask = numpy.ones(self.nwalkers, dtype=bool) + else: + wmask = numpy.zeros(self.nwalkers, dtype=bool) + wmask[walkers] = True + return self[group][wmask] + + def write_acceptance_fraction(self, acceptance_fraction): + """Write acceptance_fraction data to file. Results are written to + `fp[acceptance_fraction]`. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + """ + group = self.sampler_group + '/acceptance_fraction' + try: + self[group][:] = acceptance_fraction + except KeyError: + # dataset doesn't exist yet, create it + self[group] = acceptance_fraction + + From b089dca8702d589ca873fa3a4878a05777e7bfd2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 10:53:08 +0200 Subject: [PATCH 09/47] replace read/write functions with io in BaseSampler --- gwin/sampler/base.py | 62 ++++++-------------------------------------- 1 file changed, 8 insertions(+), 54 deletions(-) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index a7a8fad..542f040 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -25,7 +25,7 @@ Defines the base sampler class to be inherited by all samplers. """ -from abc import ABCMeta +from abc import ABCMeta, abstractmethod, abstractproperty import numpy from pycbc.io import FieldArray from pycbc.filter import autocorrelation @@ -102,58 +102,12 @@ def run(self): """ pass - @abstractmethod - def write_samples(cls, fp, samples, group="samples", **kwargs): - """This should write all of the provided samples to the given hdf file. - - This function should be used to write both samples and model stats. - - Parameters - ---------- - fp : open hdf file - The file to write to. - samples : structure array-like - Samples should be provided as a numpy structure array or a - FieldArray (basically, anything for which ``samples['param']`` will - return a numpy array). - group : str, optional - The group in ``fp`` to write the ``samples`` to. Default is - "samples". - \**kwargs : - Any other keyword args the sampler needs to write data. - """ - pass - - @abstractmethod - def read_samples(cls, fp, parameters, group="samples", **kwargs): - """This should read the requested parameters from the given hdf file. - - The samples should be returned as a ``FieldArray``. - - Parameters - ---------- - fp : open hdf file - The file to read from. - parameters : list of str - List of the parameters to return. May include functions. - group : str, optional - The group in ``fp`` to read the ``samples`` from. Default is - "samples". - \**kwargs : - Any other keyword args the sampler needs to read data. - """ - pass - - @abstractmethod - def write_posterior(cls, posterior_fp, **kwargs): - """This should write a posterior plus any other metadata to the given - file. 
- - Parameters - ---------- - posterior_fp : open hdf file - The file to write to. - \**kwargs : - Any other keyword args the sampler needs to write the posterior. + @abstractproperty + def io(self): + """A class that inherits from ``BaseInferenceFile`` to handle IO with + an hdf file. + + This should be a class, not an instance of class, so that the sampler + can initialize it when needed. """ pass From 137dc140949855ae3dc67bb599ef11a1b96db359 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:41:08 +0200 Subject: [PATCH 10/47] add checkpoint requirement; rename samples raw_samples --- gwin/sampler/base.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 542f040..38b5029 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -81,16 +81,20 @@ def static_params(self): return self.model.static_params @abstractproperty - def samples(self): - """Should return all of the samples currently stored in memory as a - numpy structure array or FieldArray. + def raw_samples(self): + """A dict mapping sampling_params to arrays of samples currently + in memory. + + The sample arrays may have any shape, and may or may not be thinned. """ pass @abstractproperty def model_stats(self): - """Should return all of the model's metadata currently stored in - memory as a numpy structure array or FieldArray. + """A dict mapping model's metadata fields to arrays of values for + each sample in ``raw_samples``. + + The arrays may have any shape, and may or may not be thinned. """ pass @@ -111,3 +115,10 @@ def io(self): can initialize it when needed. """ pass + + @abstractmethod + def checkpoint(self): + """The sampler must have a checkpoint method for dumping raw samples + and stats to the file type defined by ``io``. + """ + pass From be9b8de51331d77e365a9d85dd467ff571e979c4 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:41:42 +0200 Subject: [PATCH 11/47] start updating emcee --- gwin/sampler/emcee.py | 777 +++--------------------------------------- 1 file changed, 43 insertions(+), 734 deletions(-) diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 97786b5..7cad975 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -43,7 +43,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(BaseMCMCSampler): +class EmceeEnsembleSampler(BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. @@ -78,9 +78,7 @@ def __init__(self, model, nwalkers, pool=None, # to have the same state as the numpy generator rstate = numpy.random.get_state() sampler.random_state = rstate - # initialize - super(EmceeEnsembleSampler, self).__init__( - sampler, model) + self._sampler = sampler self._nwalkers = nwalkers @classmethod @@ -104,6 +102,47 @@ def from_cli(cls, opts, model, pool=None, return cls(model, opts.nwalkers, pool=pool, model_call=model_call) + @property + def raw_samples(self): + """A dict mapping sampling_params to arrays of samples currently + in memory. + + The arrays have shape ``nwalkers`` x ``niterations``. 
+ """ + # chain is a [additional dimensions x] niterations x ndim array + samples = self.chain + sampling_params = self.sampling_params + # convert to dictionary to apply boundary conditions + samples = {param: samples[..., ii] for + ii, param in enumerate(sampling_params)} + samples = self.model._prior.apply_boundary_conditions( + **samples) + # now convert to field array + samples = FieldArray.from_arrays([samples[param] + for param in sampling_params], + names=sampling_params) + # apply transforms to go to model params space + return self.model.apply_sampling_transforms( + samples, inverse=True) + + @property + def model_stats(self): + """Returns the model stats as a FieldArray, with field names + corresponding to the type of data returned by the model. + The returned array has shape nwalkers x niterations. If no additional + stats were returned to the sampler by the model, returns + None. + """ + stats = numpy.array(self._sampler.blobs) + if stats.size == 0: + return None + # we'll force arrays to float; this way, if there are `None`s in the + # blobs, they will be changed to `nan`s + arrays = {field: stats[..., fi].astype(float) + for fi, field in + enumerate(self.model.metadata_fields)} + return FieldArray.from_kwargs(**arrays).transpose() + @property def lnpost(self): """Get the natural logarithm of the likelihood as an @@ -222,733 +261,3 @@ def write_results(self, fp, start_iteration=None, max_iterations=max_iterations) self.write_acceptance_fraction(fp) self.write_state(fp) - - -# This is needed for two reason -# 1) pools freeze state when created and so classes *cannot be updated* -# 2) methods cannot be pickled. -class _callprior(object): - """Calls the model's prior function, and ensures that no - metadata is returned.""" - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - prior = self.callable(args, callstat='logprior', - return_all_stats=False) - return prior - - -class _callloglikelihood(object): - """Calls the model's loglikelihood function. - """ - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - return self.callable(args, callstat='loglikelihood', - return_all_stats=False) - - -class EmceePTSampler(BaseMCMCSampler): - """This class is used to construct a parallel-tempered MCMC sampler from - the emcee package's PTSampler. - - Parameters - ---------- - model : model - A model from ``gwin.models``. - ntemps : int - Number of temeratures to use in the sampler. - nwalkers : int - Number of walkers to use in sampler. - pool : function with map, Optional - A provider of a map function that allows a function call to be run - over multiple sets of arguments and possibly maps them to - cores/nodes/etc. - """ - name = "emcee_pt" - - def __init__(self, model, ntemps, nwalkers, pool=None, - model_call=None): - - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - if model_call is None: - model_call = model - - # construct the sampler: PTSampler needs the likelihood and prior - # functions separately - ndim = len(model.variable_params) - sampler = emcee.PTSampler(ntemps, nwalkers, ndim, - _callloglikelihood(model_call), - _callprior(model_call), - pool=pool) - # initialize - super(EmceePTSampler, self).__init__( - sampler, model) - self._nwalkers = nwalkers - self._ntemps = ntemps - - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. 
- - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. - - Returns - ------- - EmceePTSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.ntemps, opts.nwalkers, - pool=pool, model_call=model_call) - - @property - def ntemps(self): - return self._ntemps - - @property - def chain(self): - """Get all past samples as an ntemps x nwalker x niterations x ndim - array. - """ - # emcee returns the chain as ntemps x nwalker x niterations x ndim - return self._sampler.chain - - def clear_chain(self): - """Clears the chain and blobs from memory. - """ - # store the iteration that the clear is occuring on - self.lastclear = self.niterations - # now clear the chain - self._sampler.reset() - - @property - def model_stats(self): - """Returns the log likelihood ratio and log prior as a FieldArray. - The returned array has shape ntemps x nwalkers x niterations. - """ - # likelihood has shape ntemps x nwalkers x niterations - logl = self._sampler.lnlikelihood - # get prior from posterior - logp = self._sampler.lnprobability - logl - # compute the likelihood ratio - loglr = logl - self.model.lognl - kwargs = {'loglr': loglr, 'logprior': logp} - # if different coordinates were used for sampling, get the jacobian - if self.model.sampling_transforms is not None: - samples = self.samples - # convert to dict - d = {param: samples[param] for param in samples.fieldnames} - logj = self.model.logjacobian(**d) - kwargs['logjacobian'] = logj - return FieldArray.from_kwargs(**kwargs) - - @property - def lnpost(self): - """Get the natural logarithm of the likelihood + the prior as an - ntemps x nwalkers x niterations array. - """ - # emcee returns ntemps x nwalkers x niterations - return self._sampler.lnprobability - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An ntemps x nwalkers x ndim array of the initial positions that - were set. - """ - # create a (nwalker, ndim) array for initial positions - ntemps = self.ntemps - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((ntemps, nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1, temps='all', - flatten=False)[..., 0] - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms( - samples) - # draw random samples if samples are not provided - else: - samples = self.model.prior_rvs( - size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) - # convert to array - for i, param in enumerate(self.sampling_params): - p0[..., i] = samples[param] - self._p0 = p0 - return p0 - - def run(self, niterations, **kwargs): - """Advance the ensemble for a number of samples. - - Parameters - ---------- - niterations : int - Number of samples to get from sampler. - - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). 
- lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). - rstate : - The current state of the random number generator. - """ - pos = self._pos - if pos is None: - pos = self.p0 - res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] - # update the positions - self._pos = p - return p, lnpost, rstate - - # read/write functions - - # add ntemps and betas to metadata - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - super(EmceePTSampler, self).write_metadata(fp, **kwargs) - fp.attrs["ntemps"] = self.ntemps - fp.attrs["betas"] = self._sampler.betas - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction/temp{k}]` where k is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - group = "acceptance_fraction/temp{tk}" - # acf has shape ntemps x nwalkers - acf = self.acceptance_fraction - for tk in range(fp.ntemps): - try: - fp[group.format(tk=tk)][:] = acf[tk, :] - except KeyError: - # dataset doesn't exist yet, create it - fp[group.format(tk=tk)] = acf[tk, :] - - @staticmethod - def read_acceptance_fraction(fp, temps=None, walkers=None): - """Reads the acceptance fraction from the given file. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - temps : {None, (list of) int} - The temperature index (or a list of indices) to retrieve. If None, - acfs from all temperatures and all walkers will be retrieved. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested temps, - requested walkers). - """ - group = 'acceptance_fraction/temp{tk}' - if temps is None: - temps = numpy.arange(fp.ntemps) - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) - else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - arrays = [] - for tk in temps: - arrays.extend(fp[group.format(tk=tk)][wmask]) - return arrays - - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a variable arg. The samples are - written as an ``ntemps x nwalkers x niterations`` array. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. 
Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - """ - ntemps, nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=2) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, but " - "dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), - maxshape=(ntemps, nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, :, istart:istop] = samples[param] - - def write_results(self, fp, start_iteration=None, max_iterations=None, - **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. 
- """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - nwalkers = widx.sum() - else: - widx = slice(None, None) - nwalkers = fp.nwalkers - # temperatures to load - selecttemps = False - if temps is None: - tidx = 0 - ntemps = 1 - elif isinstance(temps, int): - tidx = temps - ntemps = 1 - else: - # temps is either 'all' or a list of temperatures; - # in either case, we'll get all of the temperatures from the file; - # if not 'all', then we'll pull out the ones we want - tidx = slice(None, None) - selecttemps = temps != 'all' - if selecttemps: - ntemps = len(temps) - else: - ntemps = fp.ntemps - # get the slice to use - if iteration is not None: - get_index = iteration - niterations = 1 - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # we'll just get the number of iterations from the returned shape - niterations = None - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][tidx, widx, get_index] - if niterations is None: - niterations = arr.shape[-1] - # pull out the temperatures we need - if selecttemps: - arr = arr[temps, ...] - if flatten: - arr = arr.flatten() - else: - # ensure that the returned array is 3D - arr = arr.reshape((ntemps, nwalkers, niterations)) - arrays[name] = arr - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=0, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - temps : {None, (list of) int, 'all'} - The temperature index (or list of indices) to retrieve. If None, - only samples from the coldest (= 0) temperature chain will be - retrieved. To retrieve all temperates pass 'all', or a list of - all of the temperatures. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested temps x requested - walkers x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. 
- array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields( - fp, samples_group, loadfields, array_class, - thin_start=thin_start, thin_interval=thin_interval, - thin_end=thin_end, iteration=iteration, temps=temps, - walkers=walkers, flatten=flatten) - - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None, - temps=None): - """Computes the autocorrleation function of the model params in the - given file. - - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain for each - temperature. An ACF per-walker will be returned instead if - ``per_walker=True``. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - temps : optional, (list of) int or 'all' - The temperature index (or list of indices) to retrieve. If None - (the default), the ACF will only be computed for the coldest (= 0) - temperature chain. To compute an ACF for all temperates pass 'all', - or a list of all of the temperatures. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``ntemps x nwalkers x niterations``. Otherwise, the returned - array will have shape ``ntemps x niterations``. 
- """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - if isinstance(temps, int): - temps = [temps] - elif temps == 'all': - temps = numpy.arange(fp.ntemps) - elif temps is None: - temps = [0] - for param in parameters: - subacfs = [] - for tk in temps: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param, - temps=tk)[param][0, :] - for ii in walkers] - # we'll stack all of the walker arrays to make a single - # nwalkers x niterations array; when these are stacked - # below, we'll get a ntemps x nwalkers x niterations array - subacfs.append(numpy.vstack(arrays)) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, - thin_end=end_index, - walkers=walkers, temps=tk, - flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - thisacf = autocorrelation.calculate_acf(samples).numpy() - subacfs.append(thisacf) - # stack the temperatures - # FIXME: the following if/else can be condensed to a single line - # using numpy.stack, once the version requirements are bumped to - # numpy >= 1.10 - if per_walker: - nw, ni = subacfs[0].shape - acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) - for tk in range(len(temps)): - acfs[param][tk, ...] = subacfs[tk] - else: - acfs[param] = numpy.vstack(subacfs) - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params and - temperatures in the given file. - - Parameter values are averaged over all walkers at each iteration and - temperature. The ACL is then calculated over the averaged chain. If - the returned ACL is `inf`, will default to the number of current - iterations. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary of ntemps-long arrays of the ACLs of each parameter. - """ - acls = {} - if end_index is None: - end_index = fp.niterations - tidx = numpy.arange(fp.ntemps) - for param in fp.variable_params: - these_acls = numpy.zeros(fp.ntemps, dtype=int) - for tk in tidx: - samples = cls.read_samples(fp, param, thin_start=start_index, - thin_interval=1, thin_end=end_index, - temps=tk, flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - these_acls[tk] = acl - acls[param] = these_acls - return acls - - @classmethod - def calculate_logevidence(cls, fp, thin_start=None, thin_end=None, - thin_interval=None): - """Calculates the log evidence from the given file using emcee's - thermodynamic integration. 
- - Parameters - ---------- - fp : InferenceFile - An open file handler to read the stats from. - thin_start : int - Index of the sample to begin returning stats. Default is to read - stats after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all stats - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - - Returns - ------- - lnZ : float - The estimate of log of the evidence. - dlnZ : float - The error on the estimate. - """ - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - stats_group = fp.stats_group - parameters = fp[stats_group].keys() - logstats = cls.read_samples(fp, parameters, samples_group=stats_group, - thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval, - temps='all', flatten=False) - # get the likelihoods - logls = logstats['loglr'] + fp.lognl - # we need the betas that were used - betas = fp.attrs['betas'] - # annoyingly, theromdynaimc integration in PTSampler is an instance - # method, so we'll implement a dummy one - ntemps = fp.ntemps - nwalkers = fp.nwalkers - ndim = len(fp.variable_params) - dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None, - None, betas=betas) - return dummy_sampler.thermodynamic_integration_log_evidence( - logls=logls, fburnin=0.) From f2b04f3d896da4b9d69213ad4b1268d23a449dc2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:42:13 +0200 Subject: [PATCH 12/47] move emcee_pt to it's own module --- gwin/sampler/emcee_pt.py | 754 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 754 insertions(+) create mode 100644 gwin/sampler/emcee_pt.py diff --git a/gwin/sampler/emcee_pt.py b/gwin/sampler/emcee_pt.py new file mode 100644 index 0000000..8cb6605 --- /dev/null +++ b/gwin/sampler/emcee_pt.py @@ -0,0 +1,754 @@ +# Copyright (C) 2016 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +""" +This modules provides classes and functions for using the emcee sampler +packages for parameter estimation. +""" + +# This is needed for two reason +# 1) pools freeze state when created and so classes *cannot be updated* +# 2) methods cannot be pickled. 
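+
+# The classes in this module use ``numpy`` and ``FieldArray``, but no imports
+# have been added to this new file yet; the following is an assumed minimal
+# set. They would normally sit at the top of the module, along with an import
+# for the ``BaseMCMCSampler`` base class used below, whose location is not
+# settled here and so is left out.
+import numpy
+
+from pycbc.io import FieldArray
+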
+class _callprior(object): + """Calls the model's prior function, and ensures that no + metadata is returned.""" + def __init__(self, model_call): + self.callable = model_call + + def __call__(self, args): + prior = self.callable(args, callfunc='prior') + return prior if isinstance(prior, numpy.float64) else prior[0] + + +class _callloglikelihood(object): + """Calls the model's loglikelihood function. + """ + def __init__(self, model_call): + self.callable = model_call + + def __call__(self, args): + return self.callable(args, callfunc='loglikelihood') + + +class EmceePTSampler(BaseMCMCSampler): + """This class is used to construct a parallel-tempered MCMC sampler from + the emcee package's PTSampler. + + Parameters + ---------- + model : model + A model from ``gwin.models``. + ntemps : int + Number of temeratures to use in the sampler. + nwalkers : int + Number of walkers to use in sampler. + pool : function with map, Optional + A provider of a map function that allows a function call to be run + over multiple sets of arguments and possibly maps them to + cores/nodes/etc. + """ + name = "emcee_pt" + + def __init__(self, model, ntemps, nwalkers, pool=None, + model_call=None): + + try: + import emcee + except ImportError: + raise ImportError("emcee is not installed.") + + if model_call is None: + model_call = model + + # construct the sampler: PTSampler needs the likelihood and prior + # functions separately + ndim = len(model.variable_params) + sampler = emcee.PTSampler(ntemps, nwalkers, ndim, + _callloglikelihood(model_call), + _callprior(model_call), + pool=pool) + # initialize + super(EmceePTSampler, self).__init__( + sampler, model) + self._nwalkers = nwalkers + self._ntemps = ntemps + + @classmethod + def from_cli(cls, opts, model, pool=None, + model_call=None): + """Create an instance of this sampler from the given command-line + options. + + Parameters + ---------- + opts : ArgumentParser options + The options to parse. + model : LikelihoodEvaluator + The model to use with the sampler. + + Returns + ------- + EmceePTSampler + An emcee sampler initialized based on the given arguments. + """ + return cls(model, opts.ntemps, opts.nwalkers, + pool=pool, model_call=model_call) + + @property + def ntemps(self): + return self._ntemps + + @property + def chain(self): + """Get all past samples as an ntemps x nwalker x niterations x ndim + array. + """ + # emcee returns the chain as ntemps x nwalker x niterations x ndim + return self._sampler.chain + + def clear_chain(self): + """Clears the chain and blobs from memory. + """ + # store the iteration that the clear is occuring on + self.lastclear = self.niterations + # now clear the chain + self._sampler.reset() + + @property + def model_stats(self): + """Returns the log likelihood ratio and log prior as a FieldArray. + The returned array has shape ntemps x nwalkers x niterations. 
+ """ + # likelihood has shape ntemps x nwalkers x niterations + logl = self._sampler.lnlikelihood + # get prior from posterior + logp = self._sampler.lnprobability - logl + # compute the likelihood ratio + loglr = logl - self.model.lognl + kwargs = {'loglr': loglr, 'prior': logp} + # if different coordinates were used for sampling, get the jacobian + if self.model.sampling_transforms is not None: + samples = self.samples + # convert to dict + d = {param: samples[param] for param in samples.fieldnames} + logj = self.model.logjacobian(**d) + kwargs['logjacobian'] = logj + return FieldArray.from_kwargs(**kwargs) + + @property + def lnpost(self): + """Get the natural logarithm of the likelihood + the prior as an + ntemps x nwalkers x niterations array. + """ + # emcee returns ntemps x nwalkers x niterations + return self._sampler.lnprobability + + def set_p0(self, samples_file=None, prior=None): + """Sets the initial position of the walkers. + + Parameters + ---------- + samples_file : InferenceFile, optional + If provided, use the last iteration in the given file for the + starting positions. + prior : JointDistribution, optional + Use the given prior to set the initial positions rather than + ``model``'s prior. + + Returns + ------- + p0 : array + An ntemps x nwalkers x ndim array of the initial positions that + were set. + """ + # create a (nwalker, ndim) array for initial positions + ntemps = self.ntemps + nwalkers = self.nwalkers + ndim = len(self.variable_params) + p0 = numpy.ones((ntemps, nwalkers, ndim)) + # if samples are given then use those as initial positions + if samples_file is not None: + samples = self.read_samples(samples_file, self.variable_params, + iteration=-1, temps='all', + flatten=False)[..., 0] + # transform to sampling parameter space + samples = self.model.apply_sampling_transforms( + samples) + # draw random samples if samples are not provided + else: + samples = self.model.prior_rvs( + size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) + # convert to array + for i, param in enumerate(self.sampling_params): + p0[..., i] = samples[param] + self._p0 = p0 + return p0 + + def run(self, niterations, **kwargs): + """Advance the ensemble for a number of samples. + + Parameters + ---------- + niterations : int + Number of samples to get from sampler. + + Returns + ------- + p : numpy.array + An array of current walker positions with shape (nwalkers, ndim). + lnpost : numpy.array + The list of log posterior probabilities for the walkers at + positions p, with shape (nwalkers, ndim). + rstate : + The current state of the random number generator. + """ + pos = self._pos + if pos is None: + pos = self.p0 + res = self._sampler.run_mcmc(pos, niterations, **kwargs) + p, lnpost, rstate = res[0], res[1], res[2] + # update the positions + self._pos = p + return p, lnpost, rstate + + # read/write functions + + # add ntemps and betas to metadata + def write_metadata(self, fp, **kwargs): + """Writes metadata about this sampler to the given file. Metadata is + written to the file's `attrs`. + + Parameters + ---------- + fp : InferenceFile + A file handler to an open inference file. + **kwargs : + All keyword arguments are saved as separate arguments in the + file attrs. If any keyword argument is a dictionary, the keyword + will point to the list of keys in the the file's ``attrs``. Each + key is then stored as a separate attr with its corresponding value. 
+ """ + super(EmceePTSampler, self).write_metadata(fp, **kwargs) + fp.attrs["ntemps"] = self.ntemps + fp.attrs["betas"] = self._sampler.betas + + def write_acceptance_fraction(self, fp): + """Write acceptance_fraction data to file. Results are written to + `fp[acceptance_fraction/temp{k}]` where k is the temperature. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + """ + group = "acceptance_fraction/temp{tk}" + # acf has shape ntemps x nwalkers + acf = self.acceptance_fraction + for tk in range(fp.ntemps): + try: + fp[group.format(tk=tk)][:] = acf[tk, :] + except KeyError: + # dataset doesn't exist yet, create it + fp[group.format(tk=tk)] = acf[tk, :] + + @staticmethod + def read_acceptance_fraction(fp, temps=None, walkers=None): + """Reads the acceptance fraction from the given file. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + temps : {None, (list of) int} + The temperature index (or a list of indices) to retrieve. If None, + acfs from all temperatures and all walkers will be retrieved. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + + Returns + ------- + array + Array of acceptance fractions with shape (requested temps, + requested walkers). + """ + group = 'acceptance_fraction/temp{tk}' + if temps is None: + temps = numpy.arange(fp.ntemps) + if walkers is None: + wmask = numpy.ones(fp.nwalkers, dtype=bool) + else: + wmask = numpy.zeros(fp.nwalkers, dtype=bool) + wmask[walkers] = True + arrays = [] + for tk in temps: + arrays.extend(fp[group.format(tk=tk)][wmask]) + return arrays + + @staticmethod + def write_samples_group(fp, samples_group, parameters, samples, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to: + + ``fp[samples_group/{vararg}]``, + + where ``{vararg}`` is the name of a variable arg. The samples are + written as an ``ntemps x nwalkers x niterations`` array. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + samples_group : str + Name of samples group to write. + parameters : list + The parameters to write to the file. + samples : FieldArray + The samples to write. Should be a FieldArray with fields containing + the samples to write and shape nwalkers x niterations. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. 
+ """ + ntemps, nwalkers, niterations = samples.shape + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = samples_group + '/{name}' + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = fp[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + fp[dataset_name].resize(istop, axis=2) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, but " + "dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), + maxshape=(ntemps, nwalkers, max_iterations), + dtype=float, fletcher32=True) + fp[dataset_name][:, :, istart:istop] = samples[param] + + def write_results(self, fp, start_iteration=None, max_iterations=None, + **metadata): + """Writes metadata, samples, model stats, and acceptance fraction + to the given file. See the write function for each of those for + details. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. + \**metadata : + All other keyword arguments are passed to ``write_metadata``. + """ + self.write_metadata(fp, **metadata) + self.write_chain(fp, start_iteration=start_iteration, + max_iterations=max_iterations) + self.write_model_stats(fp, start_iteration=start_iteration, + max_iterations=max_iterations) + self.write_acceptance_fraction(fp) + self.write_state(fp) + + @staticmethod + def _read_fields(fp, fields_group, fields, array_class, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=None, walkers=None, flatten=True): + """Base function for reading samples and model stats. See + `read_samples` and `read_model_stats` for details. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + fields_group : str + The name of the group to retrieve the desired fields. + fields : list + The list of field names to retrieve. Must be names of groups in + `fp[fields_group/]`. + array_class : FieldArray or similar + The type of array to return. Must have a `from_kwargs` attribute. + + For other details on keyword arguments, see `read_samples` and + `read_model_stats`. + + Returns + ------- + array_class + An instance of the given array class populated with values + retrieved from the fields. 
+ """ + # walkers to load + if walkers is not None: + widx = numpy.zeros(fp.nwalkers, dtype=bool) + widx[walkers] = True + nwalkers = widx.sum() + else: + widx = slice(None, None) + nwalkers = fp.nwalkers + # temperatures to load + selecttemps = False + if temps is None: + tidx = 0 + ntemps = 1 + elif isinstance(temps, int): + tidx = temps + ntemps = 1 + else: + # temps is either 'all' or a list of temperatures; + # in either case, we'll get all of the temperatures from the file; + # if not 'all', then we'll pull out the ones we want + tidx = slice(None, None) + selecttemps = temps != 'all' + if selecttemps: + ntemps = len(temps) + else: + ntemps = fp.ntemps + # get the slice to use + if iteration is not None: + get_index = iteration + niterations = 1 + else: + if thin_end is None: + # use the number of current iterations + thin_end = fp.niterations + get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval) + # we'll just get the number of iterations from the returned shape + niterations = None + # load + arrays = {} + group = fields_group + '/{name}' + for name in fields: + arr = fp[group.format(name=name)][tidx, widx, get_index] + if niterations is None: + niterations = arr.shape[-1] + # pull out the temperatures we need + if selecttemps: + arr = arr[temps, ...] + if flatten: + arr = arr.flatten() + else: + # ensure that the returned array is 3D + arr = arr.reshape((ntemps, nwalkers, niterations)) + arrays[name] = arr + return array_class.from_kwargs(**arrays) + + @classmethod + def read_samples(cls, fp, parameters, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=0, walkers=None, flatten=True, + samples_group=None, array_class=None): + """Reads samples for the given parameter(s). + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. A parameter can be the name of any + field in `fp[fp.samples_group]`, a virtual field or method of + `FieldArray` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + thin_start : int + Index of the sample to begin returning samples. Default is to read + samples after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all samples + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + iteration : int + Get a single iteration. If provided, will override the + `thin_{start/interval/end}` arguments. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + temps : {None, (list of) int, 'all'} + The temperature index (or list of indices) to retrieve. If None, + only samples from the coldest (= 0) temperature chain will be + retrieved. To retrieve all temperates pass 'all', or a list of + all of the temperatures. + flatten : {True, bool} + The returned array will be one dimensional, with all desired + samples from all desired walkers concatenated together. If False, + the returned array will have dimension requested temps x requested + walkers x requested iterations. + samples_group : {None, str} + The group in `fp` from which to retrieve the parameter fields. If + None, searches in `fp.samples_group`. 
+ array_class : {None, array class} + The type of array to return. The class must have a `from_kwargs` + class method and a `parse_parameters` method. If None, will return + a FieldArray. + + Returns + ------- + array_class + Samples for the given parameters, as an instance of a the given + `array_class` (`FieldArray` if `array_class` is None). + """ + # get the group to load from + if samples_group is None: + samples_group = fp.samples_group + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = fp[samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + return cls._read_fields( + fp, samples_group, loadfields, array_class, + thin_start=thin_start, thin_interval=thin_interval, + thin_end=thin_end, iteration=iteration, temps=temps, + walkers=walkers, flatten=flatten) + + @classmethod + def compute_acfs(cls, fp, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None, + temps=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain for each + temperature. An ACF per-walker will be returned instead if + ``per_walker=True``. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + temps : optional, (list of) int or 'all' + The temperature index (or list of indices) to retrieve. If None + (the default), the ACF will only be computed for the coldest (= 0) + temperature chain. To compute an ACF for all temperates pass 'all', + or a list of all of the temperatures. + + Returns + ------- + FieldArray + A ``FieldArray`` of the ACF vs iteration for each parameter. If + `per-walker` is True, the FieldArray will have shape + ``ntemps x nwalkers x niterations``. Otherwise, the returned + array will have shape ``ntemps x niterations``. 
+ """ + acfs = {} + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + if isinstance(temps, int): + temps = [temps] + elif temps == 'all': + temps = numpy.arange(fp.ntemps) + elif temps is None: + temps = [0] + for param in parameters: + subacfs = [] + for tk in temps: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [cls.compute_acfs(fp, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param, + temps=tk)[param][0, :] + for ii in walkers] + # we'll stack all of the walker arrays to make a single + # nwalkers x niterations array; when these are stacked + # below, we'll get a ntemps x nwalkers x niterations array + subacfs.append(numpy.vstack(arrays)) + else: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, + thin_end=end_index, + walkers=walkers, temps=tk, + flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + thisacf = autocorrelation.calculate_acf(samples).numpy() + subacfs.append(thisacf) + # stack the temperatures + # FIXME: the following if/else can be condensed to a single line + # using numpy.stack, once the version requirements are bumped to + # numpy >= 1.10 + if per_walker: + nw, ni = subacfs[0].shape + acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) + for tk in range(len(temps)): + acfs[param][tk, ...] = subacfs[tk] + else: + acfs[param] = numpy.vstack(subacfs) + return FieldArray.from_kwargs(**acfs) + + @classmethod + def compute_acls(cls, fp, start_index=None, end_index=None): + """Computes the autocorrleation length for all model params and + temperatures in the given file. + + Parameter values are averaged over all walkers at each iteration and + temperature. The ACL is then calculated over the averaged chain. If + the returned ACL is `inf`, will default to the number of current + iterations. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + + Returns + ------- + dict + A dictionary of ntemps-long arrays of the ACLs of each parameter. + """ + acls = {} + if end_index is None: + end_index = fp.niterations + tidx = numpy.arange(fp.ntemps) + for param in fp.variable_params: + these_acls = numpy.zeros(fp.ntemps, dtype=int) + for tk in tidx: + samples = cls.read_samples(fp, param, thin_start=start_index, + thin_interval=1, thin_end=end_index, + temps=tk, flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + these_acls[tk] = acl + acls[param] = these_acls + return acls + + @classmethod + def calculate_logevidence(cls, fp, thin_start=None, thin_end=None, + thin_interval=None): + """Calculates the log evidence from the given file using emcee's + thermodynamic integration. 
+ + Parameters + ---------- + fp : InferenceFile + An open file handler to read the stats from. + thin_start : int + Index of the sample to begin returning stats. Default is to read + stats after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all stats + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + + Returns + ------- + lnZ : float + The estimate of log of the evidence. + dlnZ : float + The error on the estimate. + """ + try: + import emcee + except ImportError: + raise ImportError("emcee is not installed.") + + stats_group = fp.stats_group + parameters = fp[stats_group].keys() + logstats = cls.read_samples(fp, parameters, samples_group=stats_group, + thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval, + temps='all', flatten=False) + # get the likelihoods + logls = logstats['loglr'] + fp.lognl + # we need the betas that were used + betas = fp.attrs['betas'] + # annoyingly, theromdynaimc integration in PTSampler is an instance + # method, so we'll implement a dummy one + ntemps = fp.ntemps + nwalkers = fp.nwalkers + ndim = len(fp.variable_params) + dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None, + None, betas=betas) + return dummy_sampler.thermodynamic_integration_log_evidence( + logls=logls, fburnin=0.) From 5f9c0915b77d720d847c492fd69d8b8a6564353f Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 13 Jul 2018 16:44:43 +0200 Subject: [PATCH 13/47] add base_mcmc (needs work) --- gwin/sampler/base_mcmc.py | 344 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 gwin/sampler/base_mcmc.py diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py new file mode 100644 index 0000000..69beb75 --- /dev/null +++ b/gwin/sampler/base_mcmc.py @@ -0,0 +1,344 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides constructor classes for MCMC samplers.""" + +from abc import ABCMeta, abstractmethod, abstractproperty + +class BaseMCMC(object): + """This class provides methods common to MCMCs. + + It is not a sampler class itself. Sampler classes can inherit from this + along with ``BaseSampler``. + + Attributes + ---------- + p0 : dict + A dictionary of the initial position of the walkers. Set by using + ``set_p0``. If not set yet, a ``ValueError`` is raised when the + attribute is accessed. + pos : dict + A dictionary of the current walker positions. 
If the sampler hasn't
+        been run yet, returns p0.
+    """
+    __metaclass__ = ABCMeta
+
+    lastclear = None
+    _itercounter = None
+    _pos = None
+    _p0 = None
+    _nwalkers = None
+
+    @abstractproperty
+    def samples_shape(self):
+        """The shape of the samples arrays, excluding the iterations
+        dimension."""
+        pass
+
+    @property
+    def nwalkers(self):
+        """Get the number of walkers."""
+        if self._nwalkers is None:
+            raise ValueError("number of walkers not set")
+        return self._nwalkers
+
+    @property
+    def niterations(self):
+        """Get the current number of iterations."""
+        itercounter = self._itercounter
+        if itercounter is None:
+            itercounter = 0
+        lastclear = self.lastclear
+        if lastclear is None:
+            lastclear = 0
+        return itercounter + lastclear
+
+    @property
+    def pos(self):
+        """A dictionary of the current walker positions; returns ``p0`` if
+        the sampler has not been run yet."""
+        pos = self._pos
+        if pos is None:
+            return self.p0
+        # convert to dict
+        pos = {param: self._pos[..., k]
+               for (k, param) in enumerate(self.sampling_params)}
+        return pos
+
+    @property
+    def p0(self):
+        """The starting position of the walkers in the sampling param space.
+
+        The returned object is a dict mapping the sampling parameters to the
+        values.
+        """
+        if self._p0 is None:
+            raise ValueError("initial positions not set; run set_p0")
+        # convert to dict
+        p0 = {param: self._p0[..., k]
+              for (k, param) in enumerate(self.sampling_params)}
+        return p0
+
+    def set_p0(self, samples_file=None, prior=None):
+        """Sets the initial position of the walkers.
+
+        Parameters
+        ----------
+        samples_file : InferenceFile, optional
+            If provided, use the last iteration in the given file for the
+            starting positions.
+        prior : JointDistribution, optional
+            Use the given prior to set the initial positions rather than
+            ``model``'s prior.
+
+        Returns
+        -------
+        p0 : dict
+            A dictionary mapping sampling params to the starting positions.
+        """
+        # if samples are given then use those as initial positions
+        if samples_file is not None:
+            with self.io(samples_file, 'r') as fp:
+                samples = fp.read_samples(self.variable_params,
+                                          iteration=-1)
+                # make sure we have the same shape
+                assert samples.shape == self.samples_shape, (
+                    "samples in file {} have shape {}, but I have shape "
+                    "{}".format(samples_file, samples.shape,
+                                self.samples_shape))
+            # transform to sampling parameter space
+            samples = self.model.apply_sampling_transforms(samples)
+        # draw random samples if samples are not provided
+        else:
+            nsamples = numpy.prod(self.samples_shape)
+            samples = self.model.prior_rvs(size=nsamples, prior=prior).reshape(
+                self.samples_shape)
+        # store as ND array with shape [samples_shape] x nparams
+        ndim = len(self.variable_params)
+        p0 = numpy.ones(list(self.samples_shape)+[ndim])
+        for i, param in enumerate(self.sampling_params):
+            p0[..., i] = samples[param]
+        self._p0 = p0
+        return self.p0
+
+    @classmethod
+    def n_independent_samples(cls, fp):
+        """Returns the number of independent samples stored in a file.
+
+        The number of independent samples are counted starting from after
+        burn-in. If the sampler hasn't burned in yet, then 0 is returned.
+
+        Parameters
+        -----------
+        fp : InferenceFile
+            An open file handler to read.
+
+        Returns
+        -------
+        int
+            The number of independent samples.
+ """ + # check if burned in + if not fp.is_burned_in: + return 0 + # we'll just read a single parameter from the file + samples = cls.read_samples(fp, fp.variable_params[0]) + return samples.size + + @classmethod + def compute_acfs(cls, fp, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain. An + ACF per-walker will be returned instead if ``per_walker=True``. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + + Returns + ------- + FieldArray + A ``FieldArray`` of the ACF vs iteration for each parameter. If + `per-walker` is True, the FieldArray will have shape + ``nwalkers x niterations``. + """ + acfs = {} + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + for param in parameters: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [cls.compute_acfs(fp, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] + for ii in walkers] + acfs[param] = numpy.vstack(arrays) + else: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, thin_end=end_index, + walkers=walkers, + flatten=False)[param] + samples = samples.mean(axis=0) + acfs[param] = autocorrelation.calculate_acf(samples).numpy() + return FieldArray.from_kwargs(**acfs) + + @classmethod + def compute_acls(cls, fp, start_index=None, end_index=None): + """Computes the autocorrleation length for all model params in the + given file. + + Parameter values are averaged over all walkers at each iteration. + The ACL is then calculated over the averaged chain. If the returned ACL + is `inf`, will default to the number of current iterations. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + + Returns + ------- + dict + A dictionary giving the ACL for each parameter. 
+ """ + acls = {} + for param in fp.variable_params: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, thin_end=end_index, + flatten=False)[param] + samples = samples.mean(axis=0) + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + acls[param] = acl + return acls + + @staticmethod + def write_acls(fp, acls): + """Writes the given autocorrelation lengths to the given file. + + The ACL of each parameter is saved to ``fp['acls/{param}']``. + The maximum over all the parameters is saved to the file's 'acl' + attribute. + + Parameters + ---------- + fp : InferenceFile + An open file handler to write the samples to. + acls : dict + A dictionary of ACLs keyed by the parameter. + + Returns + ------- + ACL + The maximum of the acls that was written to the file. + """ + group = 'acls/{}' + # write the individual acls + for param in acls: + try: + # we need to use the write_direct function because it's + # apparently the only way to update scalars in h5py + fp[group.format(param)].write_direct(numpy.array(acls[param])) + except KeyError: + # dataset doesn't exist yet + fp[group.format(param)] = acls[param] + # write the maximum over all params + fp.attrs['acl'] = numpy.array(acls.values()).max() + return fp.attrs['acl'] + + @staticmethod + def read_acls(fp): + """Reads the acls of all the parameters in the given file. + + Parameters + ---------- + fp : InferenceFile + An open file handler to read the acls from. + + Returns + ------- + dict + A dictionary of the ACLs, keyed by the parameter name. + """ + group = fp['acls'] + return {param: group[param].value for param in group.keys()} + + +class MCMCBurnInSupport(object): + """Provides methods for estimating burn-in.""" + + def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): + """Writes the burn in iterations to the given file. + + Parameters + ---------- + fp : InferenceFile + A file handler to an open inference file. + burn_in_iterations : array + Array of values giving the iteration of the burn in of each walker. + is_burned_in : array + Array of booleans indicating which chains are burned in. + """ + try: + fp['burn_in_iterations'][:] = burn_in_iterations + except KeyError: + fp['burn_in_iterations'] = burn_in_iterations + fp.attrs['burn_in_iterations'] = burn_in_iterations.max() + if is_burned_in is not None: + try: + fp['is_burned_in'][:] = is_burned_in + except KeyError: + fp['is_burned_in'] = is_burned_in + fp.attrs['is_burned_in'] = is_burned_in.all() + From 3d75cab34acce61a9ba1635eee37895e4c7d8431 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 14:26:01 -0400 Subject: [PATCH 14/47] add write_metadata to models --- gwin/io/base_hdf.py | 101 +++++++++++----------------------- gwin/models/base.py | 8 +++ gwin/models/base_data.py | 10 ++++ gwin/models/gaussian_noise.py | 18 ++++++ 4 files changed, 67 insertions(+), 70 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 6bd7864..413ced4 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -229,6 +229,7 @@ def cmd(self): cmd = cmd[-1] return cmd + def write_metadata(self, sampler, **kwargs): """Writes the sampler's metadata. @@ -243,24 +244,12 @@ def write_metadata(self, sampler, **kwargs): key is then stored as a separate attr with its corresponding value. 
""" self.attrs['sampler'] = samlper.name - self.attrs['model'] = sampler.model.name - self.attrs['variable_params'] = list(sampler.variable_params) - self.attrs['sampling_params'] = list(sampler.sampling_params) + # write the model's metadata + sampler.model.write_metadata(self) + write_kwargs_to_hdf_attrs(self.attrs, **kwargs) # FIXME: what will write this? #fp.attrs["lognl"] = self.model.lognl # add the static params to the kwargs - kwargs['static_params'] = sampler.static_params - for arg, val in kwargs.items(): - if val is None: - val = str(None) - if isinstance(val, dict): - self.attrs[arg] = val.keys() - for key, item in val.items(): - if item is None: - item = str(None) - self.attrs[key] = item - else: - self.attrs[arg] = val def write_logevidence(self, lnz, dlnz): """Writes the given log evidence and its error. @@ -336,11 +325,6 @@ def read_random_state(self, group=None): cached_gauss = self[dataset_name].attrs["cached_gauss"] return s, arr, pos, has_gauss, cached_gauss - def load_random_state(self): - """Sets numpy's random state using what is saved in the file. - """ - numpy.random.set_state(self.read_random_state()) - def write_strain(self, strain_dict, group=None): """Writes strain for each IFO to file. @@ -384,73 +368,25 @@ def write_stilde(self, stilde_dict, group=None): self[group.format(ifo=ifo)].attrs['delta_f'] = stilde.delta_f self[group.format(ifo=ifo)].attrs['epoch'] = float(stilde.epoch) - def write_psd(self, psds, low_frequency_cutoff, group=None): + def write_psd(self, psds, group=None): """Writes PSD for each IFO to file. Parameters ----------- psds : {dict, FrequencySeries} A dict of FrequencySeries where the key is the IFO. - low_frequency_cutoff : {dict, float} - A dict of the low-frequency cutoff where the key is the IFO. The - minimum value will be stored as an attr in the File. group : {None, str} - The group to write the strain to. If None, will write to the top - level. + The group to write the psd to. Default is ``data_group``. """ subgroup = self.data_group + "/{ifo}/psds/0" if group is None: group = subgroup else: group = '/'.join([group, subgroup]) - self.attrs["low_frequency_cutoff"] = min(low_frequency_cutoff.values()) for ifo in psds: self[group.format(ifo=ifo)] = psds[ifo] self[group.format(ifo=ifo)].attrs['delta_f'] = psds[ifo].delta_f - def write_data(self, strain_dict=None, stilde_dict=None, - psd_dict=None, low_frequency_cutoff_dict=None, - group=None): - """Writes the strain/stilde/psd. - - Parameters - ---------- - strain_dict : {None, dict} - A dictionary of strains. If None, no strain will be written. - stilde_dict : {None, dict} - A dictionary of stilde. If None, no stilde will be written. - psd_dict : {None, dict} - A dictionary of psds. If None, no psds will be written. - low_freuency_cutoff_dict : {None, dict} - A dictionary of low frequency cutoffs used for each detector in - `psd_dict`; must be provided if `psd_dict` is not None. - group : {None, str} - The group to write the strain to. If None, will write to the top - level. 
- """ - # save PSD - if psd_dict is not None: - if low_frequency_cutoff_dict is None: - raise ValueError("must provide low_frequency_cutoff_dict if " - "saving psds to output") - # apply dynamic range factor for saving PSDs since - # plotting code expects it - psd_dyn_dict = {} - for key, val in psd_dict.iteritems(): - psd_dyn_dict[key] = FrequencySeries(val*DYN_RANGE_FAC**2, - delta_f=val.delta_f) - self.write_psd(psds=psd_dyn_dict, - low_frequency_cutoff=low_frequency_cutoff_dict, - group=group) - - # save stilde - if stilde_dict is not None: - self.write_stilde(stilde_dict, group=group) - - # save strain if desired - if strain_dict is not None: - self.write_strain(strain_dict, group=group) - def write_injections(self, injection_file): """Writes injection parameters from the given injection file. @@ -668,6 +604,31 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, #return other +def write_kwargs_to_hdf_attrs(attrs, **kwargs): + """Writes the given keywords to the given ``attrs``. + + If any keyword argument points to a dict, the keyword will point to a + list of the dict's keys. Each key is then written to the attrs with its + corresponding value. + + Parameters + ---------- + attrs : an HDF attrs + Can be either the ``attrs`` of the hdf file, or any group in a file. + \**kwargs : + The keywords to write. + """ + for arg, val in kwargs.items(): + if val is None: + val = str(None) + if isinstance(val, dict): + attrs[arg] = val.keys() + # just call self again with the dict as kwargs + write_kwargs_to_hdf_attrs(attrs, **val) + else: + attrs[arg] = val + + def check_integrity(filename): """Checks the integrity of an InferenceFile. diff --git a/gwin/models/base.py b/gwin/models/base.py index f4c4378..d5a3d5e 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -34,6 +34,7 @@ from pycbc.io import FieldArray from pycbc.workflow import ConfigParser +from gwin.io.base_hdf import write_kwargs_to_hdf_attrs # # ============================================================================= @@ -742,3 +743,10 @@ def from_config(cls, cp, **kwargs): args['sampling_transforms'] = sampling_transforms args.update(kwargs) return cls(**args) + + def write_metadata(self, fp): + """Writes metadata to the given file handler.""" + fp.attrs['model'] = sampler.model.name + fp.attrs['variable_params'] = list(self.variable_params) + fp.attrs['sampling_params'] = list(self.sampling_params) + write_kwargs_to_hdf_attrs(fp.attrs, static_params=self.static_params) diff --git a/gwin/models/base_data.py b/gwin/models/base_data.py index 0c2095e..b15327f 100644 --- a/gwin/models/base_data.py +++ b/gwin/models/base_data.py @@ -150,6 +150,11 @@ def data(self): """Returns the data that was set.""" return self._data + @property + def detectors(self): + """Returns the detectors used.""" + return self._data.keys() + def _transform_params(self, **params): """Adds waveform transforms to parent's ``_transform_params``.""" params = super(BaseDataModel, self)._transform_params(**params) @@ -231,3 +236,8 @@ def from_config(cls, cp, data, delta_f=None, delta_t=None, args['waveform_generator'] = waveform_generator return cls(**args) + + def write_metadata(self, fp): + """Adds data to the metadata that's written.""" + super(BaseDataModel, self).write_metadata(fp) + fp.write_stilde(self.data) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index a2279de..c04dd4c 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -244,6 +244,7 @@ def __init__(self, 
variable_params, data, waveform_generator,
         d = data.values()[0]
         N = len(d)
         # figure out the kmin, kmax to use
+        self._f_lower = f_lower
         kmin, kmax = filter.get_cutoff_indices(f_lower, f_upper,
                                                d.delta_f, (N-1)*2)
         self._kmin = kmin
@@ -252,9 +253,12 @@ def __init__(self, variable_params, data, waveform_generator,
         norm = 4*d.delta_f
         # we'll store the weight to apply to the inner product
         if psds is None:
+            self._psds = None
             w = Array(numpy.sqrt(norm)*numpy.ones(N))
             self._weight = {det: w for det in data}
         else:
+            # store a copy of the psds
+            self._psds = {ifo: d.copy() for (ifo, d) in psds.items()}
             # temporarily suppress numpy divide by 0 warning
             numpysettings = numpy.seterr(divide='ignore')
             self._weight = {det: Array(numpy.sqrt(norm/psds[det]))
@@ -432,3 +436,17 @@ def det_optimal_snrsq(self, det):
             self.loglr
         # now try returning again
         return getattr(self._current_stats, '{}_optimal_snrsq'.format(det))
+
+    def write_metadata(self, fp):
+        """Adds writing the psds and the lognl, since it is a constant.
+
+        The lognl is written to the samples group's ``attrs``.
+        """
+        super(GaussianNoise, self).write_metadata(fp)
+        fp.attrs['f_lower'] = self._f_lower
+        if self._psds is not None:
+            fp.write_psd(self._psds)
+        attrs = fp[fp.samples_group].attrs
+        attrs['lognl'] = self.lognl
+        for det in self.detectors:
+            attrs['{}_lognl'.format(det)] = self.det_lognl(det)

From f81edab1911c8c1dcb6066f4329b423e79a202a2 Mon Sep 17 00:00:00 2001
From: Collin Capano
Date: Mon, 16 Jul 2018 18:15:03 -0400
Subject: [PATCH 15/47] move setting up checkpoint and run interval to sampler
 methods

---
 bin/gwin                  | 306 ++++++++------------------------------
 gwin/io/base_hdf.py       |   4 -
 gwin/sampler/base.py      | 122 ++++++++++++++-
 gwin/sampler/base_mcmc.py | 250 +++++++++++++++++++++++++++----
 4 files changed, 403 insertions(+), 279 deletions(-)

diff --git a/bin/gwin b/bin/gwin
index 2d0439b..6cf268d 100644
--- a/bin/gwin
+++ b/bin/gwin
@@ -39,11 +39,39 @@ from gwin.calibration import Recalibrate
 # command line usage
 parser = argparse.ArgumentParser(usage=__file__ + " [--options]",
                                  description=__doc__)
-
-# version option
 parser.add_argument("--version", action="version", version=__version__,
                     help="Prints version information.")
-
+parser.add_argument("--verbose", action="store_true", default=False,
+                    help="Print logging messages.")
+# output options
+parser.add_argument("--output-file", type=str, required=True,
+                    help="Output file path.")
+parser.add_argument("--force", action="store_true", default=False,
+                    help="If the output-file already exists, overwrite it. "
+                         "Otherwise, an OSError is raised.")
+parser.add_argument("--save-backup", action="store_true",
+                    default=False,
+                    help="Don't delete the backup file after the run has "
+                         "completed.")
+# run duration options
+parser.add_argument("--nsamples", type=int, required=True,
+                    help="The number of samples the sampler should get. "
+                         "The sampler will run until it has acquired at least "
+                         "this many samples. Depending on checkpoint settings "
+                         "it may go over.")
+parser.add_argument("--require-indep-samples", action="store_true",
+                    default=False,
+                    help="Require that the number of samples set by nsamples "
+                         "be independent. If this is not set, MCMC samplers "
+                         "will just run until they have the desired number of "
+                         "raw samples (with no thinning).")
+parser.add_argument("--samples-file", default=None,
+                    help="Use an iteration from an InferenceFile as the "
+                         "initial proposal distribution. 
The same " + "number of walkers and the same [variable_params] " + "section in the configuration file should be used. " + "The priors must allow encompass the initial " + "positions from the InferenceFile being read.") # add data options parser.add_argument("--instruments", type=str, nargs="+", help="IFOs, eg. H1 L1.") @@ -57,57 +85,8 @@ parser.add_argument("--psd-end-time", type=float, default=None, parser.add_argument("--seed", type=int, default=0, help="Seed to use for the random number generator that " "initially distributes the walkers. Default is 0.") -parser.add_argument("--samples-file", default=None, - help="Use an iteration from an InferenceFile as the " - "initial proposal distribution. The same " - "number of walkers and the same [variable_params] " - "section in the configuration file should be used. " - "The priors must allow encompass the initial " - "positions from the InferenceFile being read.") - -# add sampler options -option_utils.add_sampler_option_group(parser) - # add config options option_utils.add_config_opts_to_parser(parser) - -# output options -parser.add_argument("--output-file", type=str, required=True, - help="Output file path.") -parser.add_argument("--force", action="store_true", default=False, - help="If the output-file already exists, overwrite it. " - "Otherwise, an OSError is raised.") -parser.add_argument("--save-strain", action="store_true", default=False, - help="Save the conditioned strain time series to the " - "output file. If gate-overwhitened, this is done " - "before all gates have been applied.") -parser.add_argument("--save-stilde", action="store_true", default=False, - help="Save the conditioned strain frequency series to " - "the output file. This is done after all gates have " - "been applied.") -parser.add_argument("--save-psd", action="store_true", default=False, - help="Save the psd of each ifo to the output file.") -parser.add_argument("--checkpoint-interval", type=int, default=None, - help="Number of iterations to take before saving new " - "samples to file, calculating ACL, and updating " - "burn-in estimate.") -parser.add_argument("--resume-from-checkpoint", action="store_true", - default=False, - help="Automatically load results from checkpoint/backup " - "file.") -parser.add_argument("--save-backup", action="store_true", - default=False, - help="Don't delete the backup file after the run has " - "completed.") -parser.add_argument("--checkpoint-fast", action="store_true", - help="Do not calculate ACL after each checkpoint, only at " - "the end. 
Not applicable if n-independent-samples " - "have been specified.") - -# verbose option -parser.add_argument("--verbose", action="store_true", default=False, - help="Print logging messages.") - # add module pre-defined options fft.insert_fft_option_group(parser) opt.insert_optimization_option_group(parser) @@ -131,41 +110,6 @@ scheme.verify_processing_options(opts, parser) #strain.verify_strain_options(opts, parser) weave.verify_weave_options(opts, parser) -# check for the output file -if os.path.exists(opts.output_file) and not opts.force: - raise OSError("output-file already exists; use --force if you wish to " - "overwrite it.") - -# check for backup file(s) -checkpoint_file = opts.output_file + '.checkpoint' -backup_file = opts.output_file + '.bkup' -checkpoint_valid = validate_checkpoint_files(checkpoint_file, backup_file) - -# determine what to do with checkpoints -if checkpoint_valid and not opts.resume_from_checkpoint and not opts.force: - raise OSError("valid checkpoint file {} found, but " - "resume-from-checkpoint not on. If you wish to overwrite " - "use --force; otherwise, use --resume-from-checkpoint") -if not opts.resume_from_checkpoint and opts.force: - checkpoint_valid = False - -# check for how many iterations to run -max_iterations = opts.niterations -if opts.niterations is not None and opts.n_independent_samples is not None: - raise ValueError("Must specify either niterations or n-independent-" - "samples, not both") -elif opts.niterations is not None: - get_nsamples = opts.niterations -elif opts.n_independent_samples is not None: - if opts.checkpoint_interval is None: - raise ValueError("n-independent-samples requires a checkpoint-" - "interval; see help") - get_nsamples = opts.n_independent_samples -else: - raise ValueError("Must specify niterations or n-independent-samples; " - "see --help") - - # set seed numpy.random.seed(opts.seed) logging.info("Using seed %i", opts.seed) @@ -218,41 +162,22 @@ with ctx: logging.info("Setting up sampler") - # create sampler that will run - sampler = option_utils.sampler_from_cli(opts, model) - - # save information about this data and settings - if not checkpoint_valid: - with InferenceFile(checkpoint_file, "w") as fp: - # save command line and data - logging.info("Creating and writing data to output file") - fp.write_data( - strain_dict=strain_dict if opts.save_strain else None, - stilde_dict=stilde_dict if opts.save_stilde else None, - psd_dict=psd_dict if opts.save_psd else None, - low_frequency_cutoff_dict=low_frequency_cutoff_dict) - - # save injection parameters - if opts.injection_file: - for ifo in opts.instruments: - logging.info("Writing %s injections to output file", ifo) - if ifo in opts.injection_file.keys(): - inj_file = opts.injection_file[ifo] - elif len(opts.injection_file) == 1: - inj_file = opts.injection_file.values()[0] - else: - logging.warn("Could not find injections for %s", ifo) - continue - fp.write_injections(opts.injection_file.values()[0], ifo) - # copy to backup - shutil.copy(checkpoint_file, backup_file) - - # write the command line, resume point - for fn in [checkpoint_file, backup_file]: - with InferenceFile(fn, "a") as fp: - fp.write_command_line() - if checkpoint_valid: - fp.write_resume_point() + # Create sampler that will run. + # Note: the pool is created at this point. This means that, + # unless you enjoy angering your cluster admins, + # NO SAMPLES FILE IO SHOULD BE DONE PRIOR TO THIS POINT!!! 
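+    # As a rough sketch of what that call involves (the exact option names
+    # are set by each sampler's from_config method, so treat the following
+    # as illustrative only): load_from_config looks up the sampler named in
+    # the config file and hands it the config parser, the model, and the
+    # pool settings.  A configuration selecting the parallel-tempered emcee
+    # sampler might contain something like
+    #
+    #   [sampler]
+    #   name = emcee_pt
+    #   ntemps = 20
+    #   nwalkers = 200
+    #
+    # which would build the EmceePTSampler defined in
+    # gwin/sampler/emcee_pt.py.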
+ sampler = gwin.sampler.load_from_config( + cp, model, nprocesses=opts.nprocesses, use_mpi=opts.use_mpi) + + # set up output/checkpoint file + # Note: PyCBC's multi-ifo parser uses key:ifo for + # the injection file, even though we will use the same + # injection file all detectors. This + # should be fixed in a future version of PyCBC. Once it is, + # update this. Until then, just use the first file. + injection_file = opts.injection_file.values()[0] # None if not set + sampler.setup_output(opts.output_file, force=opts.force, + injection_file=injetion_file) # set the walkers initial positions from a pre-existing InferenceFile # or a specific initial distribution listed in the configuration file @@ -260,12 +185,12 @@ with ctx: logging.info("Setting walkers initial conditions for varying parameters") samples_file = opts.samples_file # use the checkpoint file instead if resume from checkpoint - if opts.resume_from_checkpoint and checkpoint_valid: - samples_file = checkpoint_file + if sampler.checkpoint_valid: + samples_file = sampler.checkpoint_file if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", samples_file) - samples_file = InferenceFile(samples_file, "r") + samples_file = sampler.io(samples_file, "r") init_prior = None elif len(cp.get_subsections("initial")): initial_dists = distributions.read_distributions_from_config( @@ -276,127 +201,18 @@ with ctx: *initial_dists, **{"constraints" : constraints}) else: init_prior = None - sampler.set_p0(samples_file=samples_file, prior=init_prior) - # if getting samples from file then put sampler and random number generator - # back in its former state - if samples_file is not None: - sampler.set_state_from_file(samples_file) - samples_file.close() - - # run sampler's burn in if it is in the list of burn in functions - if "use_sampler" in burn_in_eval.burn_in_functions: - # remove the sampler's burn in so we don't run more than once - burn_in_eval.burn_in_functions.pop("use_sampler") - # we'll only do this if we don't have a valid checkpoint: since the - # checkpoint happens after the sampler's burn in, the sampler's burn in - # must have already run if we have a valid checkpoint file - if not checkpoint_valid: - with InferenceFile(checkpoint_file, "a") as fp: - logging.info("Running sampler's burn in function") - burnidx, is_burned_in = burn_in.use_sampler(sampler, fp) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - # write the burn in results - logging.info("Writing burn in samples to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - # write to backup file - with InferenceFile(backup_file, "a") as fp: - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - - - # get the starting number of samples: - # nsamples keeps track of the number of samples we've obtained (if - # --n-independent-samples is used, this is the number of independent - # samples; otherwise, this is the number of iterations); - # start is the number of iterations that the file already contains (either - # due to sampler burn-in, or a previous checkpoint) - try: - with InferenceFile(checkpoint_file, "r") as fp: - start = fp.niterations - except KeyError: - start = 0 - if opts.n_independent_samples is not None: - try: - with InferenceFile(checkpoint_file, "r") as fp: - nsamples = fp.n_independent_samples - except AttributeError: - nsamples = start - else: - nsamples = start - # to 
ensure iterations are counted properly, he sampler's lastclear should - # be the same as start - sampler.lastclear = start - - interval = opts.checkpoint_interval - if interval is None: - interval = get_nsamples - - # run sampler until we have the desired number of samples - while nsamples < get_nsamples: - - end = start + interval - - # adjust the interval if we would go past the number of iterations - if opts.n_independent_samples is None and end > get_nsamples: - interval = get_nsamples - start - end = start + interval - - # run sampler and set initial values to None so that sampler - # picks up from where it left off next call - logging.info("Running sampler for {} to {} iterations".format(start, - end)) - sampler.run(interval) - - # write new samples - with InferenceFile(checkpoint_file, "a") as fp: - - logging.info("Writing results to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - logging.info("Updating burn in") - burnidx, is_burned_in = burn_in_eval.update(sampler, fp) - - # compute the acls and write - acls = None - if opts.n_independent_samples is not None or end >= get_nsamples \ - or not opts.checkpoint_fast: - logging.info("Computing acls") - acls = sampler.compute_acls(fp) - sampler.write_acls(fp, acls) - - # write to backup - with InferenceFile(backup_file, "a") as fp: - - logging.info("Writing to backup file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: - sampler.write_acls(fp, acls) - - # check validity - checkpoint_valid = validate_checkpoint_files(checkpoint_file, - backup_file) - if not checkpoint_valid: - raise IOError("error writing to checkpoint file") - - # update nsamples for next loop - if opts.n_independent_samples is not None: - with InferenceFile(checkpoint_file, 'r') as fp: - nsamples = fp.n_independent_samples - logging.info("Have {} independent samples".format(nsamples)) - else: - nsamples += interval - - - # clear the in-memory chain to save memory - logging.info("Clearing chain") - sampler.clear_chain() - - start = end + sampler.set_initial_conditions(intial_distribution=init_prior, + samples_file=samples_file) + + # Set the target number of samples for the sampler + sampler.set_target(opts.nsamples, opts.require_indep_samples) + + # Run the sampler + sampler.run() + + # finalize and exit + sampler.finalize() # compute evidence, if supported with InferenceFile(checkpoint_file, 'a') as fp: diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 413ced4..2601a93 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -229,7 +229,6 @@ def cmd(self): cmd = cmd[-1] return cmd - def write_metadata(self, sampler, **kwargs): """Writes the sampler's metadata. @@ -247,9 +246,6 @@ def write_metadata(self, sampler, **kwargs): # write the model's metadata sampler.model.write_metadata(self) write_kwargs_to_hdf_attrs(self.attrs, **kwargs) - # FIXME: what will write this? - #fp.attrs["lognl"] = self.model.lognl - # add the static params to the kwargs def write_logevidence(self, lnz, dlnz): """Writes the given log evidence and its error. 
diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 38b5029..12dcc24 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -57,7 +57,8 @@ def __init__(self, model): #@classmethod # uncomment when we move to python 3.3 @abstractmethod - def from_config(cls, cp, model, pool=None, model_call=None, **kwargs): + def from_config(cls, cp, model, nprocesses=1, use_mpi=False, + **kwargs): """This should initialize the sampler given a config file. """ pass @@ -81,9 +82,9 @@ def static_params(self): return self.model.static_params @abstractproperty - def raw_samples(self): - """A dict mapping sampling_params to arrays of samples currently - in memory. + def samples(self): + """A dict mapping variable_params to arrays of samples currently + in memory. The dictionary may also contain sampling_params. The sample arrays may have any shape, and may or may not be thinned. """ @@ -116,9 +117,122 @@ def io(self): """ pass + @abstractmethod + def set_initial_conditions(self, initial_distribution=None, + samples_file=None): + """Sets up the starting point for the sampler. + + Should also set the sampler's random state. + """ + pass + @abstractmethod def checkpoint(self): """The sampler must have a checkpoint method for dumping raw samples and stats to the file type defined by ``io``. """ pass + + def setup_output(self, output_file, force=False, injection_file=None): + """Sets up the sampler's checkpoint and output files. + + The checkpoint file has the same name as the output file, but with + ``.checkpoint`` appended to the name. A backup file will also be + created. + + If the output file already exists, an ``OSError`` will be raised. + This can be overridden by setting ``force`` to ``True``. + + Parameters + ---------- + sampler : sampler instance + Sampler + output_file : str + Name of the output file. + force : bool, optional + If the output file already exists, overwrite it. + injection_file : str, optional + If an injection was added to the data, write its information. + """ + # check for backup file(s) + checkpoint_file = output_file + '.checkpoint' + backup_file = output_file + '.bkup' + # check if we have a good checkpoint and/or backup file + checkpoint_valid = validate_checkpoint_files(checkpoint_file, + backup_file) + # Create a new file if the checkpoint doesn't exist, or if it is + # corrupted + if not checkpoint_valid: + self.create_new_output_file(checkpoint_file, force=force, + injection_file=injection_file) + # now the checkpoint is valid + checkpoint_valid = True + # copy to backup + shutil.copy(checkpoint_file, backup_file) + # write the command line + for fn in [checkpoint_file, backup_file]: + with sampler.io(fn, "a") as fp: + fp.write_command_line() + # store + self.checkpoint_file = checkpoint_file + self.backup_file = backup_file + self.checkpoint_valid = checkpoint_valid + + def set_target(self, nsamples, require_independent=False): + """Sets the number of samples the sampler should try to acquire. + + If the ``must_be_independent`` flag is set, then the number of samples + must be independent. This means, for example, that MCMC chains are + thinned by their ACL before counting samples. Otherwise, the sampler + will just run until it has the requested number of samples, regardless + of thinning. + + Parameters + ---------- + nsamples : int + The number of samples to acquire. + must_be_independent : bool, optional + Add the requirement that the target number of samples be + independent. Default is False. 
+ """ + self.target_nsamples = nsamples + self.require_indep_samples = require_independent + + + +def create_new_output_file(sampler, filename, force=False, injection_file=None, + **kwargs): + """Creates a new output file. + + If the output file already exists, an ``OSError`` will be raised. This can + be overridden by setting ``force`` to ``True``. + + Parameters + ---------- + sampler : sampler instance + Sampler + filename : str + Name of the file to create. + force : bool, optional + Create the file even if it already exists. Default is False. + injection_file : str, optional + If an injection was added to the data, write its information. + \**kwargs : + All other keyword arguments are passed through to the file's + ``write_metadata`` function. + """ + if os.path.exists(filename): + if force: + os.remove(filename) + else: + raise OSError("output-file already exists; use force if you " + "wish to overwrite it.") + logging.info("Creating file {}".format(filename)) + with sampler.io(filename, "w") as fp: + # save the sampler's metadata + fp.write_metadata(sampler) + # save injection parameters + if injection_file is not None: + logging.info("Writing injection file to output") + # just use the first one + fp.write_injections(injection_file) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 69beb75..65d4e86 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -21,10 +21,89 @@ # # ============================================================================= # -"""Provides constructor classes for MCMC samplers.""" +"""Provides constructor classes and convenience functions for MCMC samplers.""" from abc import ABCMeta, abstractmethod, abstractproperty +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# +def raw_samples_to_dict(sampler, raw_samples): + """Convenience function for converting ND array to a dict of samples. + + The samples are assumed to have dimension + ``[sampler.base_shape x] niterations x len(sampler.sampling_params)``. + + Parameters + ---------- + sampler : sampler instance + An instance of an MCMC sampler. + raw_samples : array + The array of samples to convert. + + Returns + ------- + dict : + A dictionary mapping the raw samples to the variable params. If the + sampling params are not the same as the variable params, they will + also be included. Each array will have shape + ``[sampler.base_shape x] niterations``. + """ + sampling_params = sampler.sampling_params + # convert to dictionary + samples = {param: raw_samples[..., ii] for + ii, param in enumerate(sampling_params)} + # apply boundary conditions + samples = sampler.model.prior_distribution.apply_boundary_conditions( + **samples) + # apply transforms to go to model's variable params space + return sampler.model.sampling_transforms.apply(samples, inverse=True) + + +def raw_stats_to_dict(sampler, raw_stats): + """Converts an ND array of model stats to a dict. + + The ``raw_stats`` may either be a numpy array or a list. If the + former, the stats are assumed to have shape + ``[sampler.base_shape x] niterations x nstats, where nstats are the number + of stats returned by ``sampler.model.default_stats``. If the latter, the + list is cast to an array that is assumed to be the same shape as if an + array was given. + + Parameters + ---------- + sampler : sampler instance + An instance of an MCMC sampler. 
+ raw_stats : array or list + The stats to convert. + + Returns + ------- + dict : + A dictionary mapping the model's ``default_stats`` to arrays of values. + Each array will have shape ``[sampler.base_shape x] niterations``. + """ + if not isinstance(raw_stats, numpy.ndarray): + # Assume list. Since the model returns a tuple of values, this should + # be a [sampler.base_shape x] x niterations list of tuples. We can + # therefore immediately convert this to a ND array. + raw_stats = numpy.array(raw_stats) + return {stat: raw_stats[..., ii] + for (ii, stat) in enumerate(self.model.default_stats)} + +# +# ============================================================================= +# +# BaseMCMC definition +# +# ============================================================================= +# + + class BaseMCMC(object): """This class provides methods common to MCMCs. @@ -43,15 +122,21 @@ class BaseMCMC(object): """ __metaclass__ = ABCMeta - lastclear = None + _lastclear = None _itercounter = None _pos = None _p0 = None _nwalkers = None @abstractproperty(self): - def samples_shape(self): - """Should define what shape to expect samples to be in.""" + def base_shape(self): + """What shape the sampler's samples arrays are in, excluding + the iterations dimension. + + For example, if a sampler uses 20 walkers and 3 temperatures, this + would be ``(3, 20)``. If a sampler only uses a single walker and no + temperatures this would be ``()``. + """ pass @property @@ -67,7 +152,7 @@ def niterations(self): itercounter = self._itercounter if _itercounter is None: itercounter = 0 - lastclear = self.lastclear + lastclear = self._lastclear if lastclear is None: lastclear = 0 return itercounter + lastclear @@ -119,11 +204,11 @@ def set_p0(self, samples_file=None, prior=None): samples = fp.read_samples(self.variable_params, iteration=-1) # make sure we have the same shape - assert(samples.shape == self.samples_shape, + assert(samples.shape[:-1] == self.samples_shape, "samples in file {} have shape {}, but I have shape {}". format(samples_file, samples.shape, self.samples_shape)) # transform to sampling parameter space - samples = self.model.apply_sampling_transforms(samples) + samples = self.model.sampling_transforms.apply(samples) # draw random samples if samples are not provided else: nsamples = numpy.prod(self.samples_shape) @@ -137,29 +222,142 @@ def set_p0(self, samples_file=None, prior=None): self._p0 = p0 return self.p0 - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in a file. + def set_initial_conditions(self, initial_distribution=None, + samples_file=None): + """Sets the initial starting point for the MCMC. - The number of independent samples are counted starting from after - burn-in. If the sampler hasn't burned in yet, then 0 is returned. + If a starting samples file is provided, will also load the random + state from it. + """ + self.set_p0(samples_file=samples_file, prior=initial_distribution) + # if a samples file was provided, use it to set the state of the + # sampler + if samples_file is not None: + self.set_state_from_file(samples_file) - Parameters - ----------- - fp : InferenceFile - An open file handler to read. + @abstractmethod + def set_state_from_file(self, filename): + """Sets the state of the sampler to the instance saved in a file. + """ + pass - Returns - ------- - int - The number of independent samples. + @abstractmethod + def write_state(self, filename): + """Saves the state of the sampler to the given file. 
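To make the shape conventions above concrete, the array-to-dict step that ``raw_samples_to_dict`` performs can be sketched standalone (made-up shapes and parameter names; the real function additionally applies boundary conditions and the inverse sampling transforms):

    import numpy

    nwalkers, niterations = 4, 100        # base_shape is (nwalkers,) for an ensemble MCMC
    sampling_params = ['mass1', 'mass2']  # stand-in for sampler.sampling_params
    raw_samples = numpy.random.normal(
        size=(nwalkers, niterations, len(sampling_params)))
    samples = {param: raw_samples[..., ii]
               for ii, param in enumerate(sampling_params)}
    assert samples['mass1'].shape == (nwalkers, niterations)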
""" - # check if burned in - if not fp.is_burned_in: - return 0 - # we'll just read a single parameter from the file - samples = cls.read_samples(fp, fp.variable_params[0]) - return samples.size + pass + + def run(self): + """Runs the sampler.""" + + if self.require_indep_samples and self.checkpoint_interval is None: + raise ValueError("A checkpoint interval must be set if " + "independent samples are required") + # figure out how many iterations I need to run for: this is the target + # number of samples / the number of walkers + target_niters = self.target_nsamples / self.nwalkers + + # get the starting number of samples: + # "nsamples" keeps track of the number of samples we've obtained (if + # require_indep_samples is used, this is the number of independent + # samples; otherwise, this is the total number of samples). + # "startiter" is the number of iterations that the file already contains + # (either due to sampler burn-in, or a previous checkpoint) + try: + with self.io(self.checkpoint_file, "r") as fp: + start = fp.niterations + except KeyError: + startiter = 0 + if self.require_indep_samples: + with self.io(self.checkpoint_file, "r") as fp: + nsamples = fp.n_independent_samples + else: + # the number of samples is the number of iterations times the + # number of walkers + nsamples = startiter * self.nwalkers + + # to ensure iterations are counted properly, the sampler's lastclear + # should be the same as start + self._lastclear = startiter + + iterinterval = self.checkpoint_interval + if iterinterval is None: + iterinterval = int(numpy.ceil( + float(self.target_nsamples) / self.nwalkers)) + + # run sampler until we have the desired number of samples + while nsamples < self.target_nsamples: + + enditer = startiter + iterinterval + + # adjust the interval if we would go past the number of iterations + endnsamp = enditer * self.nwalkers + if endnsamp > self.target_nsamples \ + and not self.require_indep_samples: + iterinterval = int(numpy.ceil( + (endnsamp - self.target_nsamples) / self.nwalkers)) + + # run sampler and set initial values to None so that sampler + # picks up from where it left off next call + logging.info("Running sampler for {} to {} iterations".format( + startiter, enditer)) + self.run_mcmc(iterinterval) + + # update nsamples for next loop + if opts.n_independent_samples is not None: + with InferenceFile(checkpoint_file, 'r') as fp: + nsamples = fp.n_independent_samples + logging.info("Have {} independent samples".format(nsamples)) + else: + nsamples += interval + + + # clear the in-memory chain to save memory + logging.info("Clearing chain") + sampler.clear_chain() + + start = end + + @abstractmethod + def run_for_niterations(self, niterations): + """Run the MCMC for the given number of iterations.""" + pass + + def checkpoint(self): + """Dumps current samples to the checkpoint file.""" + # write new samples + with self.io(checkpoint_file, "a") as fp: + + logging.info("Writing samples to file") + sampler.write_results(fp, static_params=model.static_params, + ifos=opts.instruments) + logging.info("Updating burn in") + burnidx, is_burned_in = burn_in_eval.update(sampler, fp) + + # compute the acls and write + acls = None + if opts.n_independent_samples is not None or end >= get_nsamples \ + or not opts.checkpoint_fast: + logging.info("Computing acls") + acls = sampler.compute_acls(fp) + sampler.write_acls(fp, acls) + + # write to backup + with InferenceFile(backup_file, "a") as fp: + + logging.info("Writing to backup file") + sampler.write_results(fp, 
static_params=model.static_params, + ifos=opts.instruments) + sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + if acls is not None: + sampler.write_acls(fp, acls) + + # check validity + checkpoint_valid = validate_checkpoint_files(checkpoint_file, + backup_file) + if not checkpoint_valid: + raise IOError("error writing to checkpoint file") + @classmethod def compute_acfs(cls, fp, start_index=None, end_index=None, From 2f9a2b2c3879ebbb6f6bfa0236057ee5fd8b5abb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 19:56:08 -0400 Subject: [PATCH 16/47] rearrange read/write functions; add checkpoint and finalize methods; add run method to base_mcmc --- gwin/io/base_hdf.py | 22 +--- gwin/io/base_mcmc.py | 18 +-- gwin/io/emcee.py | 10 +- gwin/sampler/base.py | 59 +++++++++- gwin/sampler/base_mcmc.py | 67 +++++------ gwin/sampler/emcee.py | 241 ++++++++++++++++---------------------- 6 files changed, 199 insertions(+), 218 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 2601a93..39fd96a 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -204,11 +204,11 @@ def static_params(self): return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} @property - def n_independent_samples(self): + def n_indep_samples(self): """Returns the number of independent samples stored in the file. """ try: - return self.attrs['n_independent_samples'] + return self.attrs['n_indep_samples'] except KeyError: return 0 @@ -229,24 +229,6 @@ def cmd(self): cmd = cmd[-1] return cmd - def write_metadata(self, sampler, **kwargs): - """Writes the sampler's metadata. - - Parameters - ---------- - sampler : gwin.sampler - An instance of a gwin sampler. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - self.attrs['sampler'] = samlper.name - # write the model's metadata - sampler.model.write_metadata(self) - write_kwargs_to_hdf_attrs(self.attrs, **kwargs) - def write_logevidence(self, lnz, dlnz): """Writes the given log evidence and its error. diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index a597c6d..302aed3 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -51,18 +51,7 @@ def read_acls(self): """ pass - def write_mcmc_metadata(self, sampler): - """Writes metadata unique to an ensemble MCMC. - - Parameters - ---------- - sampler : gwin.sampler - An instance of a gwin sampler. - """ - self.attrs["niterations"] = sampler.niterations - self.attrs["nwalkers"] = sampler.nwalkers - - def write_samples(self, parameters, samples, + def write_samples(self, samples, parameters=None, start_iteration=None, max_iterations=None): """Writes samples to the given file. @@ -75,11 +64,12 @@ def write_samples(self, parameters, samples, Parameters ----------- - parameters : list - The parameters to write to the file. samples : dict The samples to write. Each array in the dictionary should have shape nwalkers x niterations. + parameters : list, optional + Only write the specified parameters to the file. If None, will + write all of the keys in the ``samples`` dict. start_iteration : int, optional Write results to the file's datasets starting at the given iteration. 
Default is to append after the last iteration in the diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index c127da5..2376c64 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -33,12 +33,10 @@ class EmceeFile(EnsembleMCMCIO, BaseInferenceFile): name = 'emcee_file' def read_acceptance_fraction(self, walkers=None): - """Reads the acceptance fraction from the given file. + """Reads the acceptance fraction. Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. walkers : {None, (list of) int} The walker index (or a list of indices) to retrieve. If None, samples from all walkers will be obtained. @@ -58,12 +56,12 @@ def read_acceptance_fraction(self, walkers=None): def write_acceptance_fraction(self, acceptance_fraction): """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction]`. + the ``[sampler_group]/acceptance_fraction``. Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. + acceptance_fraction : numpy.ndarray + Array of acceptance fractions to write. """ group = self.sampler_group + '/acceptance_fraction' try: diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 12dcc24..1a2718b 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -36,7 +36,7 @@ # # ============================================================================= # -# Samplers +# Base Sampler definition # # ============================================================================= # @@ -133,6 +133,24 @@ def checkpoint(self): """ pass + @abstractmethod + def finalize(self): + """Do any finalization to the samples file before exiting.""" + pass + + def write_metadata(self, fp): + """Writes metadata about the sampler to the given filehandler.""" + fp.attrs['sampler'] = self.name + # write the model's metadata + self.model.write_metadata(fp) + self._write_more_metadata(fp) + + def _write_more_metadata(self, fp): + """Optional method that can be implemented if a sampler wants to write + more metadata than just its name and the model's metadata. + """ + pass + def setup_output(self, output_file, force=False, injection_file=None): """Sets up the sampler's checkpoint and output files. @@ -199,6 +217,13 @@ def set_target(self, nsamples, require_independent=False): self.require_indep_samples = require_independent +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# def create_new_output_file(sampler, filename, force=False, injection_file=None, **kwargs): @@ -230,9 +255,39 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, logging.info("Creating file {}".format(filename)) with sampler.io(filename, "w") as fp: # save the sampler's metadata - fp.write_metadata(sampler) + sampler.write_metadata(fp) # save injection parameters if injection_file is not None: logging.info("Writing injection file to output") # just use the first one fp.write_injections(injection_file) + +def intial_dist_from_config(cp): + """Loads a distribution for the sampler start from the given config file. + + A distribution will only be loaded if the config file has a [initial-*] + section(s). + + Parameters + ---------- + cp : Config parser + The config parser to try to load from. + + Returns + ------- + JointDistribution or None : + The initial distribution. If no [initial-*] section found in the + config file, will just return None. 
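For reference, a hypothetical config file that triggers this code path contains one or more ``[initial-*]`` sections; that naming is what the helper below looks for via ``get_subsections("initial")``, while the options inside follow whatever distribution is chosen. An illustrative snippet using a pycbc uniform distribution:

    [initial-q]
    name = uniform
    min-q = 1
    max-q = 8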
+ """ + if len(cp.get_subsections("initial")): + logging.info("Using a different distribution for the starting points " + "than the prior.") + initial_dists = distributions.read_distributions_from_config( + cp, section="initial") + constraints = distributions.read_constraints_from_config(cp, + constraint_section="initial_constraint") + init_dist = distributions.JointDistribution(sampler.variable_params, + *initial_dists, **{"constraints" : constraints}) + else: + init_dist = None + return init_dist diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 65d4e86..51f7ef8 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -24,6 +24,8 @@ """Provides constructor classes and convenience functions for MCMC samplers.""" from abc import ABCMeta, abstractmethod, abstractproperty +import logging +import numpy # # ============================================================================= @@ -253,10 +255,6 @@ def run(self): if self.require_indep_samples and self.checkpoint_interval is None: raise ValueError("A checkpoint interval must be set if " "independent samples are required") - # figure out how many iterations I need to run for: this is the target - # number of samples / the number of walkers - target_niters = self.target_nsamples / self.nwalkers - # get the starting number of samples: # "nsamples" keeps track of the number of samples we've obtained (if # require_indep_samples is used, this is the number of independent @@ -270,83 +268,82 @@ def run(self): startiter = 0 if self.require_indep_samples: with self.io(self.checkpoint_file, "r") as fp: - nsamples = fp.n_independent_samples + nsamples = fp.n_indep_samples else: # the number of samples is the number of iterations times the # number of walkers nsamples = startiter * self.nwalkers - # to ensure iterations are counted properly, the sampler's lastclear # should be the same as start self._lastclear = startiter - + # keep track of the number of iterations we've done + self._itercounter = startiter + # figure out the interval to use iterinterval = self.checkpoint_interval if iterinterval is None: iterinterval = int(numpy.ceil( float(self.target_nsamples) / self.nwalkers)) - # run sampler until we have the desired number of samples while nsamples < self.target_nsamples: - enditer = startiter + iterinterval - # adjust the interval if we would go past the number of iterations endnsamp = enditer * self.nwalkers if endnsamp > self.target_nsamples \ and not self.require_indep_samples: iterinterval = int(numpy.ceil( (endnsamp - self.target_nsamples) / self.nwalkers)) - # run sampler and set initial values to None so that sampler # picks up from where it left off next call logging.info("Running sampler for {} to {} iterations".format( startiter, enditer)) + # run the underlying sampler for the desired interval self.run_mcmc(iterinterval) - + # dump the current results + self.checkpoint() # update nsamples for next loop - if opts.n_independent_samples is not None: - with InferenceFile(checkpoint_file, 'r') as fp: - nsamples = fp.n_independent_samples + if self.require_indep_samples: + nsamples = self.n_indep_samples logging.info("Have {} independent samples".format(nsamples)) else: - nsamples += interval - + nsamples += iterinterval * self.nwalkers + self._itercounter = startiter = enditer - # clear the in-memory chain to save memory - logging.info("Clearing chain") - sampler.clear_chain() - - start = end + @abstractproperty + def n_indep_samples(self): + """Should return the number of independent samples the 
sampler has + acquired so far.""" + pass @abstractmethod - def run_for_niterations(self, niterations): + def run_mcmc(self, niterations): """Run the MCMC for the given number of iterations.""" pass def checkpoint(self): """Dumps current samples to the checkpoint file.""" # write new samples + logging.info("Writing samples to file") + self.write_results(self.checkpoint_file) + # write other stuff with self.io(checkpoint_file, "a") as fp: - - logging.info("Writing samples to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - logging.info("Updating burn in") - burnidx, is_burned_in = burn_in_eval.update(sampler, fp) + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + # FIXME + # logging.info("Updating burn in") + # burnidx, is_burned_in = burn_in_eval.update(self, fp) # compute the acls and write acls = None - if opts.n_independent_samples is not None or end >= get_nsamples \ - or not opts.checkpoint_fast: + if self.require_indep_samples: logging.info("Computing acls") - acls = sampler.compute_acls(fp) + acls = self.compute_acls(fp) sampler.write_acls(fp, acls) # write to backup with InferenceFile(backup_file, "a") as fp: logging.info("Writing to backup file") - sampler.write_results(fp, static_params=model.static_params, + sampler.write_results(fp, ifos=opts.instruments) sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) if acls is not None: @@ -358,6 +355,10 @@ def checkpoint(self): if not checkpoint_valid: raise IOError("error writing to checkpoint file") + # clear the in-memory chain to save memory + logging.info("Clearing chain") + self.clear_chain() + @classmethod def compute_acfs(cls, fp, start_index=None, end_index=None, diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 7cad975..e01ce7a 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -29,10 +29,13 @@ from __future__ import absolute_import import numpy +import emcee from pycbc.io import FieldArray from pycbc.filter import autocorrelation +from pycbc.pool import choose_pool -from .base import BaseMCMCSampler +from .base import BaseSampler +from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) # @@ -60,158 +63,103 @@ class EmceeEnsembleSampler(BaseMCMC, BaseSampler): """ name = "emcee" - def __init__(self, model, nwalkers, pool=None, - model_call=None): - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") + def __init__(self, model, outfile, nwalkers, + checkpoint_interval=None, resume_from_checkpoint=True, + n_independent_samples=None, niterations=None, + logpost_function=None, + nprocesses=1, use_mpi=False): + + self.model = model + # create a wrapper for calling the model + if logpost_function is None: + logpost_function = ='logposterior' + model_call = models.CallModel(model, logpost_function) + + # Set up the pool + if nprocesses > 1: + # these are used to help paralleize over multiple cores / MPI + models._global_instance = model_call + model_call = models._call_global_model + pool = choose_pool(mpi=use_mpi, processes=nprocesses) + if pool is not None: + pool.count = nprocesses + + self.outfile = outfile + self._nwalkers = nwalkers - if model_call is None: - model_call = model + # set up checkpointing + self.setup_checkpoint(outfile, + resume_from_checkpoint=resume_from_checkpoint) + # set up emcee ndim = len(model.variable_params) - sampler = emcee.EnsembleSampler(nwalkers, ndim, - model_call, - pool=pool) + self._sampler = 
emcee.EnsembleSampler(nwalkers, ndim, model_call, + pool=pool) # emcee uses it's own internal random number generator; we'll set it # to have the same state as the numpy generator rstate = numpy.random.get_state() - sampler.random_state = rstate - self._sampler = sampler - self._nwalkers = nwalkers + self._sampler.random_state = rstate - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. + @property + def io(self): + return EmceeFile - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. + def _write_more_metadata(self, fp): + """Adds nwalkers to the metadata.""" + fp.attrs['nwalkers'] = self.nwalkers - Returns - ------- - EmceeEnsembleSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.nwalkers, - pool=pool, model_call=model_call) + @property + def base_shape(self): + return (self.nwalkers,) @property - def raw_samples(self): - """A dict mapping sampling_params to arrays of samples currently + def samples(self): + """A dict mapping ``variable_params`` to arrays of samples currently in memory. - The arrays have shape ``nwalkers`` x ``niterations``. + The arrays have shape ``nwalkers x niterations``. """ - # chain is a [additional dimensions x] niterations x ndim array - samples = self.chain - sampling_params = self.sampling_params - # convert to dictionary to apply boundary conditions - samples = {param: samples[..., ii] for - ii, param in enumerate(sampling_params)} - samples = self.model._prior.apply_boundary_conditions( - **samples) - # now convert to field array - samples = FieldArray.from_arrays([samples[param] - for param in sampling_params], - names=sampling_params) - # apply transforms to go to model params space - return self.model.apply_sampling_transforms( - samples, inverse=True) + # emcee stores samples to it's chain attribute as a + # nwalker x niterations x ndim array + raw_samples = self._sampler.chain + return raw_samples_to_dict(self, raw_samples) @property def model_stats(self): - """Returns the model stats as a FieldArray, with field names - corresponding to the type of data returned by the model. - The returned array has shape nwalkers x niterations. If no additional - stats were returned to the sampler by the model, returns - None. + """A dict mapping the model's ``default_stats`` to arrays of values. + + The returned array has shape ``nwalkers x niterations``. """ - stats = numpy.array(self._sampler.blobs) - if stats.size == 0: - return None - # we'll force arrays to float; this way, if there are `None`s in the - # blobs, they will be changed to `nan`s - arrays = {field: stats[..., fi].astype(float) - for fi, field in - enumerate(self.model.metadata_fields)} - return FieldArray.from_kwargs(**arrays).transpose() + return raw_samples_to_dict(self._sampler.blobs, raw_stats) - @property - def lnpost(self): - """Get the natural logarithm of the likelihood as an - nwalkers x niterations array. - """ - # emcee returns nwalkers x niterations - return self._sampler.lnprobability - - @property - def chain(self): - """Get all past samples as an nwalker x niterations x ndim array.""" - # emcee returns the chain as nwalker x niterations x ndim - return self._sampler.chain - - def clear_chain(self): - """Clears the chain and blobs from memory. + def clear_samples(self): + """Clears the samples and stats from memory. 
""" # store the iteration that the clear is occuring on - self.lastclear = self.niterations + self._lastclear = self.niterations # now clear the chain self._sampler.reset() self._sampler.clear_blobs() - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An nwalkers x ndim array of the initial positions that were set. - """ - # we define set_p0 here to ensure that emcee's internal random number - # generator is set to numpy's after the distributions' rvs functions - # are called - super(EmceeEnsembleSampler, self).set_p0(samples_file=samples_file, - prior=prior) - # update the random state - self._sampler.random_state = numpy.random.get_state() - - def write_state(self, fp): - """Saves the state of the sampler in a file. - """ - fp.write_random_state(state=self._sampler.random_state) - - def set_state_from_file(self, fp): + def set_state_from_file(self, filename): """Sets the state of the sampler back to the instance saved in a file. """ - rstate = fp.read_random_state() + with self.io(filename, 'r') as fp: + rstate = fp.read_random_state() # set the numpy random state numpy.random.set_state(rstate) # set emcee's generator to the same state self._sampler.random_state = rstate - def run(self, niterations, **kwargs): + def run_mcmc(self, niterations, **kwargs): """Advance the ensemble for a number of samples. Parameters ---------- niterations : int - Number of samples to get from sampler. + Number of iterations to run the sampler for. + \**kwargs : + All other keyword arguments are passed to the emcee sampler. Returns ------- @@ -227,37 +175,44 @@ def run(self, niterations, **kwargs): if pos is None: pos = self.p0 res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] + p, _, _ = res[0], res[1], res[2] # update the positions self._pos = p - return p, lnpost, rstate - def write_results(self, fp, start_iteration=None, - max_iterations=None, **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. + def write_results(self, filename): + """Writes samples, model stats, acceptance fraction, and random state + to the given file. Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. + filename : str + The file to write to. The file is opened using the ``io`` class + in an an append state. 
""" - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) + with self.io(filename, 'a') as fp: + # write samples + fp.write_samples(self.samples, self.model.variable_params) + # write stats + fp.write_samples(self.model_stats) + # write accpetance + fp.write_acceptance_fraction(self._sampler.acceptance_fraction) + # write random state + fp.write_random_state(state=self._sampler.random_state) + + + @classmethod + def from_config(cls, cp, model, outfile, nprocesses=1, use_mpi=False): + """Loads the sampler from the given config file.""" + section = "sampler" + # check name + assert cp.get(section, "name") == cls.name, ( + "name in section [sampler] must match mine") + # get the number of walkers to use + nwalkers = int(cp.get(section, "nwalkers")) + if cp.has_option(section, "logpost-function"): + lnpost = cp.get(section, "logpost-function") + else: + lnpost = None + return cls(model, outfile, nwalkers, logpost_function=lnpost, + nprocesses=nprocesses, use_mpi=use_mpi) + From 866f39a8658beb58e873314e7917d7b94bdbccb6 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 19:57:35 -0400 Subject: [PATCH 17/47] fix whitespace --- gwin/sampler/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 1a2718b..e0cb543 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -262,6 +262,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, # just use the first one fp.write_injections(injection_file) + def intial_dist_from_config(cp): """Loads a distribution for the sampler start from the given config file. From 5b90d77b89ab62eda5033d3ae5d8f11a9367a302 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 21:04:45 -0400 Subject: [PATCH 18/47] add acl support --- gwin/io/base_hdf.py | 6 +- gwin/io/base_mcmc.py | 52 +++++++++- gwin/sampler/base_mcmc.py | 205 +++++++++++++++----------------------- gwin/sampler/emcee.py | 2 +- 4 files changed, 138 insertions(+), 127 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 39fd96a..855d6ac 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -144,7 +144,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): and ``parse_parameters`` methods. If None, will return a ``FieldArray``. \**kwargs : - All other keyword arguments are passed to ``_read_samples_data``. + All other keyword arguments are passed to ``read_raw_samples``. Returns ------- @@ -157,7 +157,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): # get the names of fields needed for the given parameters possible_fields = self[self.samples_group].keys() loadfields = array_class.parse_parameters(parameters, possible_fields) - samples = self._read_samples_data(loadfields, **kwargs) + samples = self.read_raw_samples(loadfields, **kwargs) # convert to FieldArray samples = array_class.from_kwargs(**samples) # add the static params @@ -166,7 +166,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): return samples @abstractmethod - def _read_samples_data(self, fields, **kwargs): + def read_raw_samples(self, fields, **kwargs): """Low level function for reading datasets in the samples group. This should return a dictionary of numpy arrays. 
diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py
index 302aed3..3c54d77 100644
--- a/gwin/io/base_mcmc.py
+++ b/gwin/io/base_mcmc.py
@@ -112,7 +112,7 @@ def write_samples(self, samples, parameters=None,
                                       dtype=float, fletcher32=True)
             fp[dataset_name][:, istart:istop] = samples[param]
 
-    def _read_samples_data(self, fields,
+    def read_raw_samples(self, fields,
                            thin_start=None, thin_interval=None, thin_end=None,
                            iteration=None, walkers=None, flatten=True):
         """Base function for reading samples.
@@ -167,3 +167,53 @@ def write_resume_point(self):
             resume_pts.append(niterations)
             self.attrs["resume_points"] = resume_pts
 
+    def write_acls(self, acls):
+        """Writes the given autocorrelation lengths.
+
+        The ACL of each parameter is saved to
+        ``[sampler_group]/acls/{param}``. The maximum over all the
+        parameters is saved to the file's 'acl' attribute.
+
+        Parameters
+        ----------
+        acls : dict
+            A dictionary of ACLs keyed by the parameter.
+
+        Returns
+        -------
+        int or float
+            The maximum of the ACLs that was written to the file.
+        """
+        group = self.sampler_group + '/acls/{}'
+        # write the individual acls
+        for param in acls:
+            try:
+                # we need to use the write_direct function because it's
+                # apparently the only way to update scalars in h5py
+                self[group.format(param)].write_direct(
+                    numpy.array(acls[param]))
+            except KeyError:
+                # dataset doesn't exist yet
+                self[group.format(param)] = acls[param]
+        # write the maximum over all params
+        self.attrs['acl'] = numpy.array(acls.values()).max()
+        return self.attrs['acl']
+
+    def read_acls(self):
+        """Reads the ACLs of all the parameters.
+
+        Returns
+        -------
+        dict
+            A dictionary of the ACLs, keyed by the parameter name. 
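A sketch of the round trip provided by the write/read pair being added here (the values are illustrative; ``EmceeFile`` is one of the file classes in this patch series that inherits these methods):

    from gwin.io.emcee import EmceeFile

    with EmceeFile("inference.hdf.checkpoint", "a") as fp:
        max_acl = fp.write_acls({'mass1': 12, 'mass2': 16})  # also sets fp.attrs['acl'] = 16
        acls = fp.read_acls()  # -> {'mass1': 12, 'mass2': 16}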
+ """ + group = self[self.sampler_group]['acls'] + return {param: group[param].value for param in group.keys()} + + + diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 51f7ef8..8e1f06c 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -324,44 +324,54 @@ def checkpoint(self): # write new samples logging.info("Writing samples to file") self.write_results(self.checkpoint_file) - # write other stuff - with self.io(checkpoint_file, "a") as fp: - # write the current number of iterations - fp.attrs['niterations'] = self.niterations - # FIXME - # logging.info("Updating burn in") - # burnidx, is_burned_in = burn_in_eval.update(self, fp) - - # compute the acls and write - acls = None - if self.require_indep_samples: - logging.info("Computing acls") - acls = self.compute_acls(fp) - sampler.write_acls(fp, acls) - - # write to backup - with InferenceFile(backup_file, "a") as fp: - - logging.info("Writing to backup file") - sampler.write_results(fp, - ifos=opts.instruments) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: - sampler.write_acls(fp, acls) - + logging.info("Writing to backup file") + self.write_results(self.backup_file) + # compute the acls + acls = None + if self.require_indep_samples: + logging.info("Computing acls") + acls = self.compute_acls(self.checkpoint_file) + # FIXME: + # logging.info("Updating burn in") + # burnidx, is_burned_in = burn_in_eval.update(self, fp) + # write + for fn in [self.checkpoint_file, self.backup_file]: + with self.io(fn, "a") as fp: + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + # FIXME: + #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + if acls is not None: + fp.write_acls(acls) # check validity - checkpoint_valid = validate_checkpoint_files(checkpoint_file, - backup_file) + checkpoint_valid = validate_checkpoint_files( + self.checkpoint_file, self.backup_file) if not checkpoint_valid: raise IOError("error writing to checkpoint file") - # clear the in-memory chain to save memory logging.info("Clearing chain") self.clear_chain() + @abstractmethod + def compute_acf(cls, filename, **kwargs): + """A method to compute the autocorrelation function of samples in the + given file.""" + pass + + @abstractmethod + def compute_acl(cls, filename, **kwargs): + """A method to compute the autocorrelation length of samples in the + given file.""" + pass + + + +class EnsembleMCMCAutocorrSupport(object): + """Provides class methods for calculating ensemble ACFs/ACLs. + """ @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, + def compute_acfs(cls, filename, start_index=None, end_index=None, per_walker=False, walkers=None, parameters=None): """Computes the autocorrleation function of the model params in the given file. @@ -372,8 +382,8 @@ def compute_acfs(cls, fp, start_index=None, end_index=None, Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. + filename : str + Name of a samples file to compute ACFs for. start_index : {None, int} The start index to compute the acl from. If None, will try to use the number of burn-in iterations in the file; otherwise, will start @@ -392,39 +402,41 @@ def compute_acfs(cls, fp, start_index=None, end_index=None, Returns ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape + dict : + Dictionary of arrays giving the ACFs for each parameter. 
If + ``per-walker`` is True, the arrays will have shape ``nwalkers x niterations``. """ acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - for param in parameters: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] - for ii in walkers] - acfs[param] = numpy.vstack(arrays) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - walkers=walkers, - flatten=False)[param] - samples = samples.mean(axis=0) - acfs[param] = autocorrelation.calculate_acf(samples).numpy() - return FieldArray.from_kwargs(**acfs) + with cls.io(filename, 'r') as fp: + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + for param in parameters: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [ + cls.compute_acfs(filename, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] + for ii in walkers] + acfs[param] = numpy.vstack(arrays) + else: + samples = fp.read_raw_samples( + fp, param, thin_start=start_index, thin_interval=1, + thin_end=end_index, walkers=walkers, + flatten=False)[param] + samples = samples.mean(axis=0) + acfs[param] = autocorrelation.calculate_acf( + samples).numpy() + return acfs @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): + def compute_acls(cls, filename, start_index=None, end_index=None): """Computes the autocorrleation length for all model params in the given file. @@ -434,8 +446,8 @@ def compute_acls(cls, fp, start_index=None, end_index=None): Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. + filename : str + Name of a samples file to compute ACLs for. start_index : {None, int} The start index to compute the acl from. If None, will try to use the number of burn-in iterations in the file; otherwise, will start @@ -450,69 +462,18 @@ def compute_acls(cls, fp, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - for param in fp.variable_params: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - flatten=False)[param] - samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - acls[param] = acl + with cls.io(filename, 'r') as fp: + for param in fp.variable_params: + samples = fp.read_raw_samples( + fp, param, thin_start=start_index, thin_interval=1, + thin_end=end_index, flatten=False)[param] + samples = samples.mean(axis=0) + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + acls[param] = acl return acls - @staticmethod - def write_acls(fp, acls): - """Writes the given autocorrelation lengths to the given file. - - The ACL of each parameter is saved to ``fp['acls/{param}']``. - The maximum over all the parameters is saved to the file's 'acl' - attribute. - - Parameters - ---------- - fp : InferenceFile - An open file handler to write the samples to. - acls : dict - A dictionary of ACLs keyed by the parameter. 
- - Returns - ------- - ACL - The maximum of the acls that was written to the file. - """ - group = 'acls/{}' - # write the individual acls - for param in acls: - try: - # we need to use the write_direct function because it's - # apparently the only way to update scalars in h5py - fp[group.format(param)].write_direct(numpy.array(acls[param])) - except KeyError: - # dataset doesn't exist yet - fp[group.format(param)] = acls[param] - # write the maximum over all params - fp.attrs['acl'] = numpy.array(acls.values()).max() - return fp.attrs['acl'] - - @staticmethod - def read_acls(fp): - """Reads the acls of all the parameters in the given file. - - Parameters - ---------- - fp : InferenceFile - An open file handler to read the acls from. - - Returns - ------- - dict - A dictionary of the ACLs, keyed by the parameter name. - """ - group = fp['acls'] - return {param: group[param].value for param in group.keys()} - class MCMCBurnInSupport(object): """Provides methods for estimating burn-in.""" diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index e01ce7a..42663ca 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -46,7 +46,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(BaseMCMC, BaseSampler): +class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. From 764c7411d79cf313091c4aed58c10674b08b375e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 21:05:17 -0400 Subject: [PATCH 19/47] update executable --- bin/gwin | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/bin/gwin b/bin/gwin index 6cf268d..9bf822c 100644 --- a/bin/gwin +++ b/bin/gwin @@ -157,8 +157,9 @@ with ctx: # construct class that will return the natural logarithm of likelihood model = gwin.models.read_from_config(cp, **model_args) - burn_in_eval = burn_in.BurnIn(opts.burn_in_function, - min_iterations=opts.min_burn_in) + # FIXME: move to MCMC sampler + #burn_in_eval = burn_in.BurnIn(opts.burn_in_function, + # min_iterations=opts.min_burn_in) logging.info("Setting up sampler") @@ -190,17 +191,10 @@ with ctx: if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", samples_file) - samples_file = sampler.io(samples_file, "r") init_prior = None - elif len(cp.get_subsections("initial")): - initial_dists = distributions.read_distributions_from_config( - cp, section="initial") - constraints = distributions.read_constraints_from_config(cp, - constraint_section="initial_constraint") - init_prior = distributions.JointDistribution(sampler.variable_params, - *initial_dists, **{"constraints" : constraints}) else: - init_prior = None + # try to load an initial distribution from the config file + init_prior = gwin.sampler.inital_dist_from_config(cp) sampler.set_initial_conditions(intial_distribution=init_prior, samples_file=samples_file) @@ -211,24 +205,24 @@ with ctx: # Run the sampler sampler.run() - # finalize and exit + # Finalize the output sampler.finalize() - # compute evidence, if supported - with InferenceFile(checkpoint_file, 'a') as fp: - try: - lnz, dlnz = sampler.calculate_logevidence(fp) - logging.info("Saving evidence") - sampler.write_logevidence(fp, lnz, dlnz) - except NotImplementedError: - pass + # FIXME: move to emcee_pt's finalize method + #with InferenceFile(checkpoint_file, 'a') as fp: + # try: + # lnz, dlnz = 
sampler.calculate_logevidence(fp) + # logging.info("Saving evidence") + # sampler.write_logevidence(fp, lnz, dlnz) + # except NotImplementedError: + # pass # rename checkpoint to output and delete backup logging.info("Moving checkpoint to output") -os.rename(checkpoint_file, opts.output_file) +os.rename(sampler.checkpoint_file, opts.output_file) if not opts.save_backup: logging.info("Deleting backup file") - os.remove(backup_file) + os.remove(sampler.backup_file) # exit logging.info("Done") From 24a9b4f9b50d6db4c1487f3522cba8898b5f0020 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 16 Jul 2018 21:14:31 -0400 Subject: [PATCH 20/47] add finalize to emcee, fix typos --- gwin/sampler/base_mcmc.py | 4 ++-- gwin/sampler/emcee.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 8e1f06c..8a49477 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -408,7 +408,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, ``nwalkers x niterations``. """ acfs = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: if parameters is None: parameters = fp.variable_params if isinstance(parameters, str) or isinstance(parameters, unicode): @@ -462,7 +462,7 @@ def compute_acls(cls, filename, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: for param in fp.variable_params: samples = fp.read_raw_samples( fp, param, thin_start=start_index, thin_interval=1, diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 42663ca..9da1bba 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -62,17 +62,15 @@ class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): cores/nodes/etc. 
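As a usage example, direct construction with the signature this hunk settles on would look like the following (argument values are made up; the ``from_config`` classmethod further down builds the same object from a config file):

    from gwin.sampler.emcee import EmceeEnsembleSampler

    sampler = EmceeEnsembleSampler(model, nwalkers=200,
                                   logpost_function='logposterior',
                                   nprocesses=4, use_mpi=False)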
""" name = "emcee" + _io = EmceeFile - def __init__(self, model, outfile, nwalkers, - checkpoint_interval=None, resume_from_checkpoint=True, - n_independent_samples=None, niterations=None, - logpost_function=None, + def __init__(self, model, nwalkers, logpost_function=None, nprocesses=1, use_mpi=False): self.model = model # create a wrapper for calling the model if logpost_function is None: - logpost_function = ='logposterior' + logpost_function = 'logposterior' model_call = models.CallModel(model, logpost_function) # Set up the pool @@ -84,14 +82,8 @@ def __init__(self, model, outfile, nwalkers, if pool is not None: pool.count = nprocesses - self.outfile = outfile - self._nwalkers = nwalkers - - # set up checkpointing - self.setup_checkpoint(outfile, - resume_from_checkpoint=resume_from_checkpoint) - # set up emcee + self._nwalkers = nwalkers ndim = len(model.variable_params) self._sampler = emcee.EnsembleSampler(nwalkers, ndim, model_call, pool=pool) @@ -102,7 +94,7 @@ def __init__(self, model, outfile, nwalkers, @property def io(self): - return EmceeFile + return self._io def _write_more_metadata(self, fp): """Adds nwalkers to the metadata.""" @@ -200,8 +192,23 @@ def write_results(self, filename): fp.write_random_state(state=self._sampler.random_state) + def finalize(self): + """Finalize the samples file.""" + # Compute/write final ACL + acls = self.compute_acls(self.checkpoint_file) + # FIXME: + # logging.info("Updating burn in") + # burnidx, is_burned_in = burn_in_eval.update(self, fp) + # write + with self.io(self.checkpoint_file, "a") as fp: + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + # FIXME: + #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + fp.write_acls(acls) + @classmethod - def from_config(cls, cp, model, outfile, nprocesses=1, use_mpi=False): + def from_config(cls, cp, model, nprocesses=1, use_mpi=False): """Loads the sampler from the given config file.""" section = "sampler" # check name @@ -213,6 +220,6 @@ def from_config(cls, cp, model, outfile, nprocesses=1, use_mpi=False): lnpost = cp.get(section, "logpost-function") else: lnpost = None - return cls(model, outfile, nwalkers, logpost_function=lnpost, + return cls(model, nwalkers, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) From c35a28f177c1bd0bedc11e02bbeb7a5d6e8fdc75 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 17 Jul 2018 07:42:22 -0400 Subject: [PATCH 21/47] change write_posterior to expect filename, not file --- gwin/io/base_hdf.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 855d6ac..06bbe34 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -103,8 +103,9 @@ def parse_parameters(self, parameters, array_class=None): to derive the virtual field or method), and/or a function of these. array_class : array class, optional - The type of array to use to parse the parameters. The class must have a - ``parse_parameters`` method. Default is to use a ``FieldArray``. + The type of array to use to parse the parameters. The class must + have a ``parse_parameters`` method. Default is to use a + ``FieldArray``. Returns ------- @@ -174,14 +175,14 @@ def read_raw_samples(self, fields, **kwargs): pass @abstractmethod - def write_posterior(self, posterior_fp, **kwargs): + def write_posterior(self, posterior_file, **kwargs): """This should write a posterior plus any other metadata to the given file. 
Parameters ---------- - posterior_fp : open hdf file - The file to write to. + posterior_file : str + Name of the file to write to. \**kwargs : Any other keyword args the sampler needs to write the posterior. """ From 26fc718b99aae43af6c02e6c5eaaa33b8d30d495 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 20 Jul 2018 10:34:50 -0400 Subject: [PATCH 22/47] change burn in module to just have functions --- gwin/burn_in.py | 420 +++++++++++------------------------------------- 1 file changed, 97 insertions(+), 323 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index bcb4ef6..895ba44 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -19,374 +19,148 @@ """ import numpy - from scipy.stats import ks_2samp +from pycbc.filter import autocorrelation + -def ks_test(sampler, fp, threshold=0.9): - """Burn in based on whether the p-value of the KS test between the samples - at the last iteration and the samples midway along the chain for each - parameter is > ``threshold``. +def ks_test(samples1, samples2, threshold=0.9): + """Applies a KS test to determine if two sets of samples are the same. + + The ks test is applied parameter-by-parameter. If the two-tailed p-value + returned by the test is greater than ``threshold``, the samples are + considered to be the same. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + samples1 : dict + Dictionary of mapping parameters to the first set of samples. + samples2 : dict + Dictionary of mapping parameters to the second set of samples. threshold : float The thershold to use for the p-value. Default is 0.9. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. + dict : + Dictionary mapping parameter names to booleans indicating whether the + given parameter passes the KS test. """ - nwalkers = fp.nwalkers - niterations = fp.niterations - # Create a dictionary which would have keys are the variable args and - # values are booleans indicating whether the p-value for the parameters - # satisfies the KS test - is_burned_in_param = {} + is_the_same = {} + assert set(samples1.keys()) == set(samples2.keys()), ( + "samples1 and 2 must have the same parameters") # iterate over the parameters - for param in fp.variable_params: - # read samples for the parameter from the last iteration of the chain - samples_last_iter = sampler.read_samples(fp, param, iteration=-1, - flatten=True)[param] - # read samples for the parameter from the iteration midway - # along the chain - samples_chain_midpt = sampler.read_samples( - fp, param, iteration=int(niterations/2), flatten=True)[param] + for param in samples1: + s1 = samples1[param] + s2 = samples2[param] _, p_value = ks_2samp(samples_last_iter, samples_chain_midpt) - # check if p_value is > than the desired range - is_burned_in_param[param] = p_value > threshold - - # The chains are burned in if the p-value of the KS test lies - # in the range [0.1,0.9] for all the parameters. - # If the KS test is passed, the chains have burned in at their - # mid-way point. 
- if all(is_burned_in_param.values()): - is_burned_in = numpy.ones(nwalkers, dtype=bool) - burn_in_idx = numpy.repeat(niterations/2, nwalkers).astype(int) - else: - is_burned_in = numpy.zeros(nwalkers, dtype=bool) - burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int) - return burn_in_idx, is_burned_in + is_the_same[param] = p_value > threshold + return is_the_same -def n_acl(sampler, fp, nacls=10): +def n_acl(chain, nacls=5): """Burn in based on ACL. - The sampler is considered burned in if the number of itertions is >= - ``nacls`` times the maximum ACL over all parameters, as measured from the - first iteration. + This applies the following test to determine burn in: + + 1. The first half of the chain is ignored. + + 2. An ACL is calculated from the second half. + + 3. If ``nacls`` times the ACL is < the number of iterations / 2, + the chain is considered to be burned in at the half-way point. Parameters ---------- - sampler : pycbc.inference.sampler - Sampler to determine burn in for. May be either an instance of a - `inference.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. - nacls : int - Number of ACLs to use for burn in. Default is 10. + chain : array + The chain of samples to apply the test to. Must be 1D. + nacls : int, optional + Number of ACLs to use for burn in. Default is 5. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. By definition - of this function, all chains reach burn in at the same iteration. Thus - the returned array is the burn-in index repeated by the number of - chains. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. Since - all chains obtain burn in at the same time, this is either an array - of all False or True. + burn_in_idx : int + The burn in index. If the chain is not burned in, will be equal to the + length of the chain. + is_burned_in : bool + Whether or not the chain is burned in. + acl : int + The ACL that was estimated. """ - acl = numpy.array(sampler.compute_acls(fp, start_index=0).values()).max() - burn_idx = nacls * acl - is_burned_in = burn_idx < fp.niterations - if not is_burned_in: - burn_idx = fp.niterations - nwalkers = fp.nwalkers - return numpy.repeat(burn_idx, nwalkers).astype(int), \ - numpy.repeat(is_burned_in, nwalkers).astype(bool) + kstart = int(len(chain)/2.) + acl = autocorrelation.calculate_acl(chain[kstart:]) + is_burned_in = nacls * acl < kstart + if is_burned_in: + burn_in_idx = kstart + else: + burn_in_idx = len(chain) + return burn_in_idx, is_burned_in, acl -def max_posterior(sampler, fp): +def max_posterior(lnps_per_walker, dim): """Burn in based on samples being within dim/2 of maximum posterior. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + lnps_per_walker : 2D array + Array of values that are proportional to the log posterior values. Must + have shape ``nwalkers x niterations``. + dim : float + The dimension of the parameter space. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. + burn_in_idx : array of int + The burn in indices of each walker. 
If a walker is not burned in, its + index will be be equal to the length of the chain. + is_burned_in : array of bool + Whether or not a walker is burned in. """ - # get the posteriors - # Note: multi-tempered samplers should just return the coldest chain by - # default - chain_stats = sampler.read_samples( - fp, ['loglr', 'logprior'], samples_group=fp.stats_group, - thin_interval=1, thin_start=0, thin_end=None, flatten=False) - chain_posteriors = chain_stats['loglr'] + chain_stats['logprior'] - dim = float(len(fp.variable_params)) - - # find the posterior to compare against - max_p = chain_posteriors.max() - criteria = max_p - dim/2 - nwalkers = chain_posteriors.shape[-2] - niterations = chain_posteriors.shape[-1] - burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int) - is_burned_in = numpy.zeros(nwalkers, dtype=bool) - - # find the first iteration in each chain where the logplr has exceeded + if len(lnps_per_walker.shape) != 2: + raise ValueError("lnps_per_walker must have shape " + "nwalkers x niterations") + # find the value to compare against + max_p = lnps_per_walker.max() + criteria = max_p - dim/2. + nwalkers, niterations = lnps_per_walker.shape + burn_in_idx = numpy.empty(nwalkers, dtype=int) + is_burned_in = numpy.empty(nwalkers, dtype=bool) + # find the first iteration in each chain where the logpost has exceeded # max_p - dim/2 for ii in range(nwalkers): - chain = chain_posteriors[..., ii, :] - # numpy.where will return a tuple with multiple arrays if the chain is - # more than 1D (which can happen for multi-tempered samplers). Always - # taking the last array ensures we are looking at the indices that - # count out iterations - idx = numpy.where(chain >= criteria)[-1] - if idx.size != 0: - burn_in_idx[ii] = idx[0] - is_burned_in[ii] = True + chain = lnps_per_walker[ii,:] + passedidx = numpy.where(chain >= criteria)[0] + is_burned_in[ii] = is_burned_in = passedidx.size > 0 + if is_burned_in: + burn_in_idx[ii] = passedidx[0] + else: + burn_in_idx[ii] = niterations return burn_in_idx, is_burned_in -def posterior_step(sampler, fp): - """Burn in based on the last time a chain made a jump > dim/2. +def posterior_step(logposts, dim): + """Finds the last time a chain made a jump > dim/2. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + logposts : array + 1D array of values that are proportional to the log posterior values. + dim : float + The dimension of the parameter space. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - By definition of this function, all values are set to True. + int + The index of the last time the logpost made a jump > dim/2. If that + never happened, returns 0. 
""" - # get the posteriors - # Note: multi-tempered samplers should just return the coldest chain by - # default - chain_stats = sampler.read_samples( - fp, ['loglr', 'logprior'], samples_group=fp.stats_group, - thin_interval=1, thin_start=0, thin_end=None, flatten=False) - chain_posteriors = chain_stats['loglr'] + chain_stats['logprior'] - nwalkers = chain_posteriors.shape[-2] - dim = float(len(fp.variable_params)) - burn_in_idx = numpy.zeros(nwalkers).astype(int) + if logposts.ndim > 1: + raise ValueError("logposts must be a 1D array") criteria = dim/2. - - # find the last iteration in each chain where the logplr has - # jumped by more than dim/2 - for ii in range(nwalkers): - chain = chain_posteriors[..., ii, :] - dp = abs(numpy.diff(chain)) - idx = numpy.where(dp >= criteria)[-1] - if idx.size != 0: - burn_in_idx[ii] = idx[-1] + 1 - return burn_in_idx, numpy.ones(nwalkers, dtype=bool) - - -def half_chain(sampler, fp): - """Takes the second half of the iterations as post-burn in. - - Parameters - ---------- - sampler : gwin.sampler - This option is not used; it is just here give consistent API as the - other burn in functions. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. - - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - By definition of this function, all values are set to True. - """ - nwalkers = fp.nwalkers - niterations = fp.niterations - return ( - numpy.repeat(niterations/2, nwalkers).astype(int), - numpy.ones(nwalkers, dtype=bool), - ) - - -def use_sampler(sampler, fp=None): - """Uses the sampler's burn_in function. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. Must be an instance of an - `gwin.sampler` that has a `burn_in` function. - fp : InferenceFile, optional - This option is not used; it is just here give consistent API as the - other burn in functions. - - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - Since the sampler's burn in function will run until all chains - are burned, all values are set to True. - """ - sampler.burn_in() - return ( - sampler.burn_in_iterations, - numpy.ones(len(sampler.burn_in_iterations), dtype=bool), - ) - - -burn_in_functions = { - 'ks_test': ks_test, - 'n_acl': n_acl, - 'max_posterior': max_posterior, - 'posterior_step': posterior_step, - 'half_chain': half_chain, - 'use_sampler': use_sampler, - } - - -class BurnIn(object): - """Class to estimate the number of burn in iterations. - - Parameters - ---------- - function_names : list, optional - List of name of burn in functions to use. All names in the provided - list muset be in the `burn_in_functions` dict. If none provided, will - use no burn-in functions. - min_iterations : int, optional - Minimum number of burn in iterations to use. The burn in iterations - returned by evaluate will be the maximum of this value - and the values returned by the burn in functions provided in - `function_names`. Default is 0. 
- - Examples - -------- - Initialize a `BurnIn` instance that will use `max_posterior` and - `posterior_step` as the burn in criteria: - - >>> import gwin - >>> burn_in = gwin.BurnIn(['max_posterior', 'posterior_step']) - - Use this `BurnIn` instance to find the burn-in iteration of each walker - in an inference result file: - - >>> from pycbc.io import InferenceFile - >>> fp = InferenceFile('gwin.hdf', 'r') - >>> burn_in.evaluate(gwin.samplers[fp.sampler_name], fp) - array([11486, 11983, 11894, ..., 11793, 11888, 11981]) - - """ - - def __init__(self, function_names, min_iterations=0): - if function_names is None: - function_names = [] - self.min_iterations = min_iterations - self.burn_in_functions = {fname: burn_in_functions[fname] - for fname in function_names} - - def evaluate(self, sampler, fp): - """Evaluates sampler's chains to find burn in. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for - determing burn in. - - Returns - ------- - burnidx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - """ - # if the number of iterations is < than the minimium desired, - # just return the number of iterations and all False - if fp.niterations < self.min_iterations: - return numpy.repeat(self.min_iterations, fp.nwalkers), \ - numpy.zeros(fp.nwalkers, dtype=bool) - # if the file already has burn in iterations saved, use those as a - # base - try: - burnidx = fp['burn_in_iterations'][:] - except KeyError: - # just use the minimum - burnidx = numpy.repeat(self.min_iterations, fp.nwalkers) - # start by assuming is burned in; the &= below will make this false - # if any test yields false - is_burned_in = numpy.ones(fp.nwalkers, dtype=bool) - if self.burn_in_functions != {}: - newidx = [] - for func in self.burn_in_functions.values(): - idx, state = func(sampler, fp) - newidx.append(idx) - is_burned_in &= state - newidx = numpy.vstack(newidx).max(axis=0) - # update the burn in idx if any test yields a larger iteration - mask = burnidx < newidx - burnidx[mask] = newidx[mask] - # if any burn-in idx are less than the min iterations, set to the - # min iterations - burnidx[burnidx < self.min_iterations] = self.min_iterations - return burnidx, is_burned_in - - def update(self, sampler, fp): - """Evaluates burn in and saves the updated indices to the given file. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for - determing burn in. - - Returns - ------- - burnidx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. 
- """ - burnidx, is_burned_in = self.evaluate(sampler, fp) - sampler.burn_in_iterations = burnidx - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - return burnidx, is_burned_in + dp = numpy.diff(logposts) + indices = numpy.where(dp >= criteria)[0] + if indices.size > 0: + idx = indices[-1] + 1 + else: + idx = 0 + return idx From 8d69b1598e80c60abe7d2a198cd672c89b3c60ad Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 20 Jul 2018 10:35:10 -0400 Subject: [PATCH 23/47] start to define burn in support class --- gwin/sampler/base_mcmc.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 8a49477..ddc5289 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -476,7 +476,15 @@ def compute_acls(cls, filename, start_index=None, end_index=None): class MCMCBurnInSupport(object): - """Provides methods for estimating burn-in.""" + """Provides methods for estimating burn-in of an ensemble MCMC.""" + + def __init__(self, burn_in_tests): + self.burn_in_tests = burn_in_tests + + def _max_posterior(self, filename): + """Applies max posterior test to self.""" + with self.io(filename, 'r') as fp: + samples = self.read_samples() def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): """Writes the burn in iterations to the given file. From 8a6506a4ee21acc7796e652efa7ed402b5c193f0 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 12:28:13 +0200 Subject: [PATCH 24/47] move burn in class to burn_in module; add evaluate --- gwin/burn_in.py | 145 +++++++++++++++++++++++++++++++++++++- gwin/sampler/base_mcmc.py | 43 +---------- 2 files changed, 146 insertions(+), 42 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 895ba44..d872860 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -13,6 +13,14 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# """ This modules provides classes and functions for determining when Markov Chains have burned in. @@ -23,7 +31,16 @@ from pycbc.filter import autocorrelation +# The value to use for a burn-in iteration if a chain is not burned in +NOT_BURNED_IN_ITER = -1 +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# def ks_test(samples1, samples2, threshold=0.9): """Applies a KS test to determine if two sets of samples are the same. 
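As an illustration of the function-based API that PATCH 22 introduces, the sketch below exercises the refactored helpers directly on toy arrays rather than on a sampler/file pair. It is not part of any patch: the toy data, walker counts, and parameter names are invented, and only the ``ks_test``, ``n_acl``, and ``posterior_step`` signatures are taken from the diffs above.

import numpy
from gwin.burn_in import ks_test, n_acl, posterior_step

# fabricate a small ensemble: 10 walkers, 1000 iterations, 2 parameters
rng = numpy.random.RandomState(0)
nwalkers, niterations = 10, 1000
lnps = rng.normal(size=(nwalkers, niterations))

# ks_test compares two dicts of samples parameter-by-parameter and returns
# a dict mapping each parameter name to a bool
samples_mid = {'x': rng.normal(size=nwalkers), 'y': rng.normal(size=nwalkers)}
samples_end = {'x': rng.normal(size=nwalkers), 'y': rng.normal(size=nwalkers)}
passed = ks_test(samples_mid, samples_end, threshold=0.9)

# n_acl operates on a single 1D chain and returns (index, is_burned_in, acl)
burn_idx, burned_in, acl = n_acl(lnps[0, :], nacls=5)

# posterior_step returns the index of the last jump > dim/2 in a 1D chain of
# values proportional to the log posterior
last_jump = posterior_step(lnps[0, :], dim=2.)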
@@ -93,7 +110,7 @@ def n_acl(chain, nacls=5): if is_burned_in: burn_in_idx = kstart else: - burn_in_idx = len(chain) + burn_in_idx = NOT_BURNED_IN_ITER return burn_in_idx, is_burned_in, acl @@ -134,7 +151,7 @@ def max_posterior(lnps_per_walker, dim): if is_burned_in: burn_in_idx[ii] = passedidx[0] else: - burn_in_idx[ii] = niterations + burn_in_idx[ii] = NOT_BURNED_IN_ITER return burn_in_idx, is_burned_in @@ -164,3 +181,127 @@ def posterior_step(logposts, dim): else: idx = 0 return idx + + +# +# ============================================================================= +# +# Burn in classes +# +# ============================================================================= +# + +from pycbc.io.record import get_vars_from_arg + +class MCMCBurnInSupport(object): + """Provides methods for estimating burn-in of an ensemble MCMC.""" + + default_burn_in_iteration = -1 + + def __init__(self, sampler, burn_in_test, **kwargs): + self.sampler = sampler + # determine the burn-in tests that are going to be done + self.do_tests = get_vars_from_arg(burn_in_test) + self.burn_in_test = burn_in_test + self.burn_in_data = {t: {} for t in self.do_tests} + self.is_burned_in = False + self.burn_in_iteration = None + if 'nacl' in burn_in_tests: + # get the number of acls to use + self._nacls = kwargs.pop('nacls', 5) + if 'ks_test' in burn_in_tests: + self._ksthreshold = kwargs.pop('ks_threshold', 0.9) + + def max_posterior(self, filename): + """Applies max posterior test to self.""" + with sampler.io(filename, 'r') as fp: + samples = fp.read_raw_samples( + ['loglikelihood', 'logprior'], thin_start=0, thin_interval=1, + flatten=False) + logposts = samples['loglikelihood'] + samples['logprior'] + burn_in_idx, is_burned_in = burn_in.max_posterior( + logposts, len(self.variable_params)) + data = self.burn_in_data['max_posterior'] + # required things to store + data['is_burned_in'] = is_burned_in.all() + data['burn_in_iteration'] = burn_in_idx.max() + # additional info + data['iteration_per_walker'] = burn_in_idx + data['status_per_walker'] = is_burned_in + + def nacl(self, filename): + """Applies the nacl burn-in test""" + with sampler.io(filename, 'r') as fp: + niters = fp.niterations + kstart = int(niters / 2.) + acls = sampler.compute_acls(filename, start_index=kstart) + is_burned_in = {param: (self._nacls * acl) < kstart + for (param, acl) in acls.items()} + data = self.burn_in_data['nacl'] + # required things to store + data['is_burned_in'] = all(is_burned_in.values()) + if data['is_burned_in']: + data['burn_in_iteration'] = kstart + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + # additional information + data['status_per_parameter'] = is_burned_in + # since we calculated it, save the acls to the sampler + sampler.acls = acls + + def ks_test(self, filename): + """Applies ks burn-in test.""" + with sampler.io(filename, 'r') as fp: + niters = fp.niterations + # get the samples from the mid point + samples1 = fp.read_raw_samples( + ['loglikelihood', 'logprior'], iteration=int(niters/2.)) + # get the last samples + samples2 = fp.read_raw_samples( + ['loglikelihood', 'logprior'], iteration=-1) + # do the test + # is_the_same is a dictionary of params --> bool indicating whether or + # not the 1D marginal is the same at the half way point + is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) + data = self.burn_in_data['ks_test'] + # required things to store + data['is_burned_in'] = all(is_the_same.values()) + if data['is_burned_in']: + data['burn_in_iteration'] = int(niters/2.) 
+ else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + # additional + data['status_per_parameter'] = is_the_same + + def evaluate(self, filename): + """Runs all of the burn-in tests.""" + for tst in self.tests_to_do: + getattr(self, tst)(filename) + # The iteration to use for burn-in depends on the logic in the burn-in + # test string. For example, if the test was 'max_posterior | nacl' and + # max_posterior burned-in at iteration 5000 while nacl burned in at + # iteration 6000, we'd want to use 5000 as the burn-in iteration. + # However, if the test was 'max_posterior & nacl', we'd want to use + # 6000 as the burn-in iteration. The code below handles all cases by + # doing the following: first, take the collection of burn in iterations + # from all the burn in tests that were applied. Next, cycle over the + # iterations in increasing order, checking which tests have burned in + # by that point. Then evaluate the burn-in string at that point to see + # if it passes, and if so, what the iteration is. The first point that + # the test passes is used as the burn-in iteration. + burn_in_iters = numpy.unique([self.data[t]['burn_in_iteration'] + for t in self.do_tests]) + burn_in_iters.sort() + for ii in burn_in_iters: + test_results = {t: (self.data[t]['is_burned_in'] & + self.data[t]['burn_in_iteration'] <= ii) + for t in self.do_tests} + is_burned_in = eval(self.burn_in_test, {"__builtins__": None}, + test_results) + if is_burned_in: + break + self.is_burned_in = is_burned_in + if is_burned_in: + self.burn_in_iteration = ii + else: + self.burn_in_iteration = NOT_BURNED_IN_ITER diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index ddc5289..83959bf 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -327,10 +327,10 @@ def checkpoint(self): logging.info("Writing to backup file") self.write_results(self.backup_file) # compute the acls - acls = None + self.acls = None if self.require_indep_samples: logging.info("Computing acls") - acls = self.compute_acls(self.checkpoint_file) + self.acls = self.compute_acls(self.checkpoint_file) # FIXME: # logging.info("Updating burn in") # burnidx, is_burned_in = burn_in_eval.update(self, fp) @@ -341,7 +341,7 @@ def checkpoint(self): fp.attrs['niterations'] = self.niterations # FIXME: #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: + if self.acls is not None: fp.write_acls(acls) # check validity checkpoint_valid = validate_checkpoint_files( @@ -473,40 +473,3 @@ def compute_acls(cls, filename, start_index=None, end_index=None): acl = samples.size acls[param] = acl return acls - - -class MCMCBurnInSupport(object): - """Provides methods for estimating burn-in of an ensemble MCMC.""" - - def __init__(self, burn_in_tests): - self.burn_in_tests = burn_in_tests - - def _max_posterior(self, filename): - """Applies max posterior test to self.""" - with self.io(filename, 'r') as fp: - samples = self.read_samples() - - def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): - """Writes the burn in iterations to the given file. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - burn_in_iterations : array - Array of values giving the iteration of the burn in of each walker. - is_burned_in : array - Array of booleans indicating which chains are burned in. 
- """ - try: - fp['burn_in_iterations'][:] = burn_in_iterations - except KeyError: - fp['burn_in_iterations'] = burn_in_iterations - fp.attrs['burn_in_iterations'] = burn_in_iterations.max() - if is_burned_in is not None: - try: - fp['is_burned_in'][:] = is_burned_in - except KeyError: - fp['is_burned_in'] = is_burned_in - fp.attrs['is_burned_in'] = is_burned_in.all() - From 2711460e143c2d28a81d4a611005e004808843de Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 12:28:45 +0200 Subject: [PATCH 25/47] add write burn in to io --- gwin/io/base_mcmc.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 3c54d77..f068d12 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -215,5 +215,18 @@ def read_acls(self): group = self[self.sampler_group]['acls'] return {param: group[param].value for param in group.keys()} - - + def write_burn_in(self, burn_in): + """Write the given burn-in data to the given filename.""" + group = self[self.sampler_group] + group.attrs['is_burned_in'] = burn_in.is_burned_in + group.attrs['burn_in_iteration'] = burn_in.burn_in_iteration + group.attrs['burn_in_test'] = burn_in.burn_in_test + # write individual test data + for tst in burn_in.burn_in_data: + key = 'burn_in_tests/{}'.format(tst) + try: + attrs = group[key].attrs + except KeyError: + group.create_group(key) + attrs = group[key].attrs + write_kwargs_to_hdf_attrs(attrs, **burn_in.burn_in_data[tst]) From 59201ffdb3b8816399b2b92187a7d41da28ea64f Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 14:43:46 +0200 Subject: [PATCH 26/47] add from_config for burn-in class --- gwin/burn_in.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index d872860..5e5793e 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -196,8 +196,6 @@ def posterior_step(logposts, dim): class MCMCBurnInSupport(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" - default_burn_in_iteration = -1 - def __init__(self, sampler, burn_in_test, **kwargs): self.sampler = sampler # determine the burn-in tests that are going to be done @@ -208,9 +206,9 @@ def __init__(self, sampler, burn_in_test, **kwargs): self.burn_in_iteration = None if 'nacl' in burn_in_tests: # get the number of acls to use - self._nacls = kwargs.pop('nacls', 5) + self._nacls = int(kwargs.pop('nacls', 5)) if 'ks_test' in burn_in_tests: - self._ksthreshold = kwargs.pop('ks_threshold', 0.9) + self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) def max_posterior(self, filename): """Applies max posterior test to self.""" @@ -305,3 +303,17 @@ def evaluate(self, filename): self.burn_in_iteration = ii else: self.burn_in_iteration = NOT_BURNED_IN_ITER + + @classmethod + def from_config(cls, cp, sampler): + """Loads burn in from section [sampler-burn_in].""" + section = 'sampler' + tag = 'burn_in' + burn_in_test = cp.get_opt_tag(section, 'burn-in-test', tag) + kwargs = {} + if cp.has_option_tag(section, 'nacl', tag): + kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag)) + if cp.has_option_tag(section, 'ks-threshold', tag) + kwargs['ks_threshold'] = float( + cp.get_opt_tag(section, 'ks-threshold', tag) + return cls(sampler, burn_in_test, **kwargs) From 35a8408cd6d1e85553ac8b795ee172627f0e4bc3 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 30 Jul 2018 16:19:36 +0200 Subject: [PATCH 27/47] more support for burn-in, calculation of independent 
samples --- gwin/burn_in.py | 2 +- gwin/sampler/base_mcmc.py | 60 ++++++++++++++++++++++++++++----------- gwin/sampler/emcee.py | 30 +++++++++----------- 3 files changed, 58 insertions(+), 34 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 5e5793e..6e83bb5 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -193,7 +193,7 @@ def posterior_step(logposts, dim): from pycbc.io.record import get_vars_from_arg -class MCMCBurnInSupport(object): +class MCMCBurnInTests(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" def __init__(self, sampler, burn_in_test, **kwargs): diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 83959bf..120846c 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -129,6 +129,7 @@ class BaseMCMC(object): _pos = None _p0 = None _nwalkers = None + _burn_in = None @abstractproperty(self): def base_shape(self): @@ -259,8 +260,8 @@ def run(self): # "nsamples" keeps track of the number of samples we've obtained (if # require_indep_samples is used, this is the number of independent # samples; otherwise, this is the total number of samples). - # "startiter" is the number of iterations that the file already contains - # (either due to sampler burn-in, or a previous checkpoint) + # "startiter" is the number of iterations that the file already + # contains (either due to sampler burn-in, or a previous checkpoint) try: with self.io(self.checkpoint_file, "r") as fp: start = fp.niterations @@ -303,22 +304,44 @@ def run(self): # update nsamples for next loop if self.require_indep_samples: nsamples = self.n_indep_samples - logging.info("Have {} independent samples".format(nsamples)) + logging.info("Have {} independent samples post burn in".format( + nsamples)) else: nsamples += iterinterval * self.nwalkers self._itercounter = startiter = enditer - @abstractproperty + @propetry + def burn_in(self): + """The class for doing burn-in tests (if specified).""" + return self._burn_in + + def set_burn_in(self, burn_in): + """Sets the object to use for doing burn-in tests.""" + self._burn_in = burn_in + def n_indep_samples(self): - """Should return the number of independent samples the sampler has + """The number of independent samples post burn-in that the sampler has acquired so far.""" - pass + if self.acls is None: + acl = numpy.inf + else: + acl = numpy.array(self.acls.values()).max() + if self.burn_in is None: + niters = self.niterations + else: + niters = self.niterations - self.burn_in.burn_in_iteration + return self.nwalkers * int(niters // acl) @abstractmethod def run_mcmc(self, niterations): """Run the MCMC for the given number of iterations.""" pass + @abstractmethod + def write_results(self, filename): + """Should write all samples currently in memory to the given file.""" + pass + def checkpoint(self): """Dumps current samples to the checkpoint file.""" # write new samples @@ -326,23 +349,26 @@ def checkpoint(self): self.write_results(self.checkpoint_file) logging.info("Writing to backup file") self.write_results(self.backup_file) - # compute the acls + # check for burn in, compute the acls self.acls = None - if self.require_indep_samples: + if self.burn_in is not None: + logging.info("Updating burn in") + self.burn_in.evaluate(self.checkpoint_file) + # Compute acls; the burn_in test may have calculated an acl and saved + # it, in which case we don't need to do it again. 
+ if self.acls is None: logging.info("Computing acls") self.acls = self.compute_acls(self.checkpoint_file) - # FIXME: - # logging.info("Updating burn in") - # burnidx, is_burned_in = burn_in_eval.update(self, fp) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: - # write the current number of iterations - fp.attrs['niterations'] = self.niterations - # FIXME: - #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) + if self.burn_in is not None: + fp.write_burn_in(self.burn_in) if self.acls is not None: fp.write_acls(acls) + # write the current number of iterations + fp.attrs['niterations'] = self.niterations + fp.attrs['n_indep_samples'] = self.n_indep_samples # check validity checkpoint_valid = validate_checkpoint_files( self.checkpoint_file, self.backup_file) @@ -408,7 +434,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, ``nwalkers x niterations``. """ acfs = {} - with cls._io(filename, 'r') as fp: + with cls.io(filename, 'r') as fp: if parameters is None: parameters = fp.variable_params if isinstance(parameters, str) or isinstance(parameters, unicode): @@ -462,7 +488,7 @@ def compute_acls(cls, filename, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - with cls._io(filename, 'r') as fp: + with cls.io(filename, 'r') as fp: for param in fp.variable_params: samples = fp.read_raw_samples( fp, param, thin_start=start_index, thin_interval=1, diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 9da1bba..34a09f2 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -33,9 +33,11 @@ from pycbc.io import FieldArray from pycbc.filter import autocorrelation from pycbc.pool import choose_pool +from pycbc.workflow import ConfigParser from .base import BaseSampler from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) +from ../ import burn_in # @@ -63,6 +65,7 @@ class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): """ name = "emcee" _io = EmceeFile + burn_in_class = burn_in.MCMCBurnInTests def __init__(self, model, nwalkers, logpost_function=None, nprocesses=1, use_mpi=False): @@ -191,21 +194,10 @@ def write_results(self, filename): # write random state fp.write_random_state(state=self._sampler.random_state) - def finalize(self): - """Finalize the samples file.""" - # Compute/write final ACL - acls = self.compute_acls(self.checkpoint_file) - # FIXME: - # logging.info("Updating burn in") - # burnidx, is_burned_in = burn_in_eval.update(self, fp) - # write - with self.io(self.checkpoint_file, "a") as fp: - # write the current number of iterations - fp.attrs['niterations'] = self.niterations - # FIXME: - #sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - fp.write_acls(acls) + """All data is written by the last checkpoint in the run method, so + this just passes.""" + pass @classmethod def from_config(cls, cp, model, nprocesses=1, use_mpi=False): @@ -220,6 +212,12 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): lnpost = cp.get(section, "logpost-function") else: lnpost = None - return cls(model, nwalkers, logpost_function=lnpost, + obj = cls(model, nwalkers, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) - + # add burn-in if it's specified + try: + bit = obj.burn_in_class.from_config(cp, obj) + except ConfigParser.NoSectionError: + bit = None + obj.set_burn_in(bit) + return obj From c0eb5c66a1edbf30b6b397c79f48cd00c1d0be91 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Mon, 
30 Jul 2018 17:43:06 +0200 Subject: [PATCH 28/47] add thin_start/interval/end to the hdf file attrs --- gwin/io/base_hdf.py | 57 ++++++++++++++++++++++++++++++++------- gwin/io/base_mcmc.py | 10 ++++--- gwin/sampler/base_mcmc.py | 1 - 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 06bbe34..007c3bc 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -213,6 +213,41 @@ def n_indep_samples(self): except KeyError: return 0 + @property + def thin_start(self): + """The default start index to use when reading samples. + + This tries to read from ``thin_start`` in the ``attrs``. If it isn't + there, just returns 0.""" + try: + return self.attrs['thin_start'] + except KeyError: + return 0 + + @property + def thin_interval(self): + """The default interval to use when reading samples. + + This tries to read from ``thin_interval`` in the ``attrs``. If it + isn't there, just returns 1. + """ + try: + return self.attrs['thin_interval'] + except KeyError: + return 1 + + @property + def thin_end(self): + """The defaut end index to use when reading samples. + + This tries to read from ``thin_end`` in the ``attrs``. If it isn't + there, just returns None. + """ + try: + return self.attrs['thin_end'] + except KeyError: + return None + @property def cmd(self): """Returns the (last) saved command line. @@ -557,7 +592,7 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, # check that we're not trying to overwrite this file if other == self.name: raise IOError("destination is the same as this file") - other = InferenceFile(other, 'w') + other = self.__class__(other, 'w') # metadata self.copy_metadata(other) # info @@ -572,15 +607,17 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, parameter_names=parameter_names, read_args=read_args, write_args=write_args) - # if any down selection was done, re-set the burn in iterations and - # the acl, and the niterations. - # The last dimension of the samples returned by the sampler should - # be the number of iterations. - #if samples.shape[-1] != self.niterations: - # other.attrs['acl'] = 1 - # other.attrs['burn_in_iterations'] = 0 - # other.attrs['niterations'] = samples.shape[-1] - #return other + # if any down selection was done, re-set the default + # thin-start/interval/end + p = self[self.samples_group].keys()[0] + my_shape = self[self.samples_group][p].shape + p = other[other.samples_group].keys()[0] + other_shape = other[other.samples_group][p].shape + if my_shape != other_shape: + other.attrs['thin_start'] = 0 + other.attrs['thin_interval'] = 1 + other.attrs['thin_end'] = None + return other def write_kwargs_to_hdf_attrs(attrs, **kwargs): diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index f068d12..651c67b 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -196,8 +196,10 @@ def write_acls(self, acls): # dataset doesn't exist yet self[group.format(param)] = acls[param] # write the maximum over all params - self.attrs['acl'] = numpy.array(acls.values()).max() - return self.attrs['acl'] + acl = numpy.array(acls.values()).max() + self.attrs['acl'] = acl + # set the default thin interval to be the acl + self.attrs['thin_interval'] = acl def read_acls(self): """Reads the acls of all the parameters. 
@@ -218,9 +220,11 @@ def read_acls(self): def write_burn_in(self, burn_in): """Write the given burn-in data to the given filename.""" group = self[self.sampler_group] + group.attrs['burn_in_test'] = burn_in.burn_in_test group.attrs['is_burned_in'] = burn_in.is_burned_in group.attrs['burn_in_iteration'] = burn_in.burn_in_iteration - group.attrs['burn_in_test'] = burn_in.burn_in_test + # set the defaut thin_start to be the burn_in_iteration + self.attrs['thin_start'] = burn_in.burn_in_iteration # write individual test data for tst in burn_in.burn_in_data: key = 'burn_in_tests/{}'.format(tst) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 120846c..954e40f 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -391,7 +391,6 @@ def compute_acl(cls, filename, **kwargs): pass - class EnsembleMCMCAutocorrSupport(object): """Provides class methods for calculating ensemble ACFs/ACLs. """ From eead8a805ebea4358a5432b6afd2e2b56ee37143 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:28:00 +0200 Subject: [PATCH 29/47] fix typos, whitespace in burn_in --- gwin/burn_in.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 6e83bb5..f7dca6a 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -30,10 +30,12 @@ from scipy.stats import ks_2samp from pycbc.filter import autocorrelation +from pycbc.io.record import get_vars_from_arg # The value to use for a burn-in iteration if a chain is not burned in NOT_BURNED_IN_ITER = -1 + # # ============================================================================= # @@ -41,6 +43,8 @@ # # ============================================================================= # + + def ks_test(samples1, samples2, threshold=0.9): """Applies a KS test to determine if two sets of samples are the same. @@ -145,7 +149,7 @@ def max_posterior(lnps_per_walker, dim): # find the first iteration in each chain where the logpost has exceeded # max_p - dim/2 for ii in range(nwalkers): - chain = lnps_per_walker[ii,:] + chain = lnps_per_walker[ii, :] passedidx = numpy.where(chain >= criteria)[0] is_burned_in[ii] = is_burned_in = passedidx.size > 0 if is_burned_in: @@ -191,7 +195,6 @@ def posterior_step(logposts, dim): # ============================================================================= # -from pycbc.io.record import get_vars_from_arg class MCMCBurnInTests(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" @@ -260,10 +263,10 @@ def ks_test(self, filename): # do the test # is_the_same is a dictionary of params --> bool indicating whether or # not the 1D marginal is the same at the half way point - is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) + is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) data = self.burn_in_data['ks_test'] # required things to store - data['is_burned_in'] = all(is_the_same.values()) + data['is_burned_in'] = all(is_the_same.values()) if data['is_burned_in']: data['burn_in_iteration'] = int(niters/2.) 
else: @@ -315,5 +318,5 @@ def from_config(cls, cp, sampler): kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag)) if cp.has_option_tag(section, 'ks-threshold', tag) kwargs['ks_threshold'] = float( - cp.get_opt_tag(section, 'ks-threshold', tag) + cp.get_opt_tag(section, 'ks-threshold', tag)) return cls(sampler, burn_in_test, **kwargs) From e765c129faae4f431f581ec8341c4ce490d220ef Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:33:42 +0200 Subject: [PATCH 30/47] fix whitespace, typos in base_hdf --- gwin/io/base_hdf.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 007c3bc..e3d9a00 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -41,9 +41,10 @@ from .. import sampler as gwin_sampler + class BaseInferenceFile(h5py.File): """Base class for all inference hdf files. - + This is a subclass of the h5py.File object. It adds functions for handling reading and writing the samples from the samplers. @@ -67,7 +68,7 @@ def __init__(self, path, mode=None, **kwargs): def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. - + Note that properties will be called before this, so if there are any properties that share the same name as something in ``.attrs``, that property will get returned. @@ -162,7 +163,7 @@ def read_samples(self, parameters, array_class=None, **kwargs): # convert to FieldArray samples = array_class.from_kwargs(**samples) # add the static params - for p,val in self.static_params.items(): + for (p, val) in self.static_params.items(): setattr(samples, p, val) return samples @@ -612,7 +613,7 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, p = self[self.samples_group].keys()[0] my_shape = self[self.samples_group][p].shape p = other[other.samples_group].keys()[0] - other_shape = other[other.samples_group][p].shape + other_shape = other[other.samples_group][p].shape if my_shape != other_shape: other.attrs['thin_start'] = 0 other.attrs['thin_interval'] = 1 @@ -622,7 +623,7 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, def write_kwargs_to_hdf_attrs(attrs, **kwargs): """Writes the given keywords to the given ``attrs``. - + If any keyword argument points to a dict, the keyword will point to a list of the dict's keys. Each key is then written to the attrs with its corresponding value. From ab40ad04168e4dbb93c278dfc873bbfb4fc51647 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:34:03 +0200 Subject: [PATCH 31/47] rename EnsembleMCMCIO to MCMCIO; fix whitespace --- gwin/io/base_mcmc.py | 7 ++++--- gwin/io/emcee.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 651c67b..ef834dc 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -40,7 +40,8 @@ from .hdf import InferenceFile -class EnsembleMCMCIO(obect): + +class MCMCIO(obect): """Abstract base class that provides some IO functions for ensemble MCMCs. """ __metaclass__ = ABCMeta @@ -113,8 +114,8 @@ def write_samples(self, samples, parameters=None, fp[dataset_name][:, istart:istop] = samples[param] def read_raw_samples(self, fields, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): """Base function for reading samples. 
Parameters diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index 2376c64..e2ad663 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -27,7 +27,8 @@ from .base_hdf import BaseInferenceFile from .base_mcmc import EnsembleMCMCIO -class EmceeFile(EnsembleMCMCIO, BaseInferenceFile): + +class EmceeFile(MCMCIO, BaseInferenceFile): """Class to handle file IO for the ``emcee`` sampler.""" name = 'emcee_file' @@ -69,5 +70,3 @@ def write_acceptance_fraction(self, acceptance_fraction): except KeyError: # dataset doesn't exist yet, create it self[group] = acceptance_fraction - - From ac6d5148d022ffe0e5f103f37fcf672b85a152ea Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:37:49 +0200 Subject: [PATCH 32/47] fix typo --- gwin/burn_in.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index f7dca6a..afd91a2 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -316,7 +316,7 @@ def from_config(cls, cp, sampler): kwargs = {} if cp.has_option_tag(section, 'nacl', tag): kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag)) - if cp.has_option_tag(section, 'ks-threshold', tag) + if cp.has_option_tag(section, 'ks-threshold', tag): kwargs['ks_threshold'] = float( cp.get_opt_tag(section, 'ks-threshold', tag)) return cls(sampler, burn_in_test, **kwargs) From 23366e3eb062f1b357ccd23003a6b88a44c78f13 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 10:45:49 +0200 Subject: [PATCH 33/47] fix whitespace --- gwin/models/gaussian_noise.py | 2 +- gwin/sampler/base.py | 25 +++++++++++++------------ gwin/sampler/base_mcmc.py | 10 ++++++---- gwin/sampler/emcee.py | 8 ++++---- gwin/sampler/emcee_pt.py | 4 +++- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index c04dd4c..645dbb1 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -439,7 +439,7 @@ def det_optimal_snrsq(self, det): def write_metadata(self, fp): """Adds writing the psds and lognl, since it's a constant. - + The lognl is written to the sample group's ``attrs``. """ super(GaussianNoise, self).write_data(fp) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index e0cb543..f0f3b48 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -55,7 +55,7 @@ class BaseSampler(object): def __init__(self, model): self.model = model - #@classmethod # uncomment when we move to python 3.3 + # @classmethod <--uncomment when we move to python 3.3 @abstractmethod def from_config(cls, cp, model, nprocesses=1, use_mpi=False, **kwargs): @@ -85,7 +85,7 @@ def static_params(self): def samples(self): """A dict mapping variable_params to arrays of samples currently in memory. The dictionary may also contain sampling_params. - + The sample arrays may have any shape, and may or may not be thinned. """ pass @@ -102,7 +102,7 @@ def model_stats(self): @abstractmethod def run(self): """This function should run the sampler. - + Any checkpointing should be done internally in this function. """ pass @@ -111,7 +111,7 @@ def run(self): def io(self): """A class that inherits from ``BaseInferenceFile`` to handle IO with an hdf file. - + This should be a class, not an instance of class, so that the sampler can initialize it when needed. """ @@ -121,7 +121,7 @@ def io(self): def set_initial_conditions(self, initial_distribution=None, samples_file=None): """Sets up the starting point for the sampler. - + Should also set the sampler's random state. 
""" pass @@ -144,7 +144,7 @@ def write_metadata(self, fp): # write the model's metadata self.model.write_metadata(fp) self._write_more_metadata(fp) - + def _write_more_metadata(self, fp): """Optional method that can be implemented if a sampler wants to write more metadata than just its name and the model's metadata. @@ -160,7 +160,7 @@ def setup_output(self, output_file, force=False, injection_file=None): If the output file already exists, an ``OSError`` will be raised. This can be overridden by setting ``force`` to ``True``. - + Parameters ---------- sampler : sampler instance @@ -231,7 +231,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, If the output file already exists, an ``OSError`` will be raised. This can be overridden by setting ``force`` to ``True``. - + Parameters ---------- sampler : sampler instance @@ -285,10 +285,11 @@ def intial_dist_from_config(cp): "than the prior.") initial_dists = distributions.read_distributions_from_config( cp, section="initial") - constraints = distributions.read_constraints_from_config(cp, - constraint_section="initial_constraint") - init_dist = distributions.JointDistribution(sampler.variable_params, - *initial_dists, **{"constraints" : constraints}) + constraints = distributions.read_constraints_from_config( + cp, constraint_section="initial_constraint") + init_dist = distributions.JointDistribution( + sampler.variable_params, *initial_dists, + **{"constraints": constraints}) else: init_dist = None return init_dist diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 954e40f..71b6f8b 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -34,6 +34,8 @@ # # ============================================================================= # + + def raw_samples_to_dict(sampler, raw_samples): """Convenience function for converting ND array to a dict of samples. @@ -131,11 +133,11 @@ class BaseMCMC(object): _nwalkers = None _burn_in = None - @abstractproperty(self): + @abstractproperty def base_shape(self): - """What shape the sampler's samples arrays are in, excluding + """What shape the sampler's samples arrays are in, excluding the iterations dimension. - + For example, if a sampler uses 20 walkers and 3 temperatures, this would be ``(3, 20)``. If a sampler only uses a single walker and no temperatures this would be ``()``. @@ -173,7 +175,7 @@ def pos(self): @property def p0(self): """The starting position of the walkers in the sampling param space. - + The returned object is a dict mapping the sampling parameters to the values. """ diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 34a09f2..a16cde9 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -37,7 +37,7 @@ from .base import BaseSampler from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) -from ../ import burn_in +from gwin import burn_in # @@ -111,7 +111,7 @@ def base_shape(self): def samples(self): """A dict mapping ``variable_params`` to arrays of samples currently in memory. - + The arrays have shape ``nwalkers x niterations``. """ # emcee stores samples to it's chain attribute as a @@ -122,7 +122,7 @@ def samples(self): @property def model_stats(self): """A dict mapping the model's ``default_stats`` to arrays of values. - + The returned array has shape ``nwalkers x niterations``. 
""" return raw_samples_to_dict(self._sampler.blobs, raw_stats) @@ -213,7 +213,7 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): else: lnpost = None obj = cls(model, nwalkers, logpost_function=lnpost, - nprocesses=nprocesses, use_mpi=use_mpi) + nprocesses=nprocesses, use_mpi=use_mpi) # add burn-in if it's specified try: bit = obj.burn_in_class.from_config(cp, obj) diff --git a/gwin/sampler/emcee_pt.py b/gwin/sampler/emcee_pt.py index 8cb6605..cef83fd 100644 --- a/gwin/sampler/emcee_pt.py +++ b/gwin/sampler/emcee_pt.py @@ -26,9 +26,11 @@ packages for parameter estimation. """ -# This is needed for two reason +# The following two classes are needed for two reason # 1) pools freeze state when created and so classes *cannot be updated* # 2) methods cannot be pickled. + + class _callprior(object): """Calls the model's prior function, and ensures that no metadata is returned.""" From 60d0e75ca2ddf1edc8b44f30cdf889a6df9aa347 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 11:48:05 +0200 Subject: [PATCH 34/47] write filetype to inference hdf files; provide a loadfile function --- gwin/io/__init__.py | 44 +++++++++++++++++++++++++++++++++- gwin/io/base_hdf.py | 58 ++++++++++++++++++++++----------------------- 2 files changed, 72 insertions(+), 30 deletions(-) diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index 2e19621..8b78ce3 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -17,5 +17,47 @@ """I/O utilities for GWIn """ -from .hdf import InferenceFile +import h5py as _h5py +from .emcee import EmceeFile from .txt import InferenceTXTFile + +filetypes = { + EmceeFile.name: EmceeFile, +} + + +def loadfile(path, mode=None, filetype=None, **kwargs): + """Loads the given file using the appropriate InferenceFile class. + + If ``filetype`` is not provided, this will try to retreive the ``filetype`` + from the file's ``attrs``. If the file does not exist yet, an IOError will + be raised if ``filetype`` is not provided. + + Parameters + ---------- + path : str + The filename to load. + mode : str, optional + What mode to load the file with, e.g., 'w' for write, 'r' for read, + 'a' for append. Default will default to h5py.File's mode, which is 'a'. + filetype : str, optional + Force the file to be loaded with the given class name. This must be + provided if creating a new file. + + Returns + ------- + filetype instance + An open file handler to the file. The class used for IO with the file + is determined by the ``filetype`` keyword (if provided) or the + ``filetype`` stored in the file (if not provided). + """ + if filetype is None: + # try to read the file to get its filetype + try: + with _h5py.File(path, 'r') as fp: + filetype = fp.attrs['filetype'] + except IOError: + # file doesn't exist, filetype must be provided + raise IOError("The file appears not to exist. 
In this case, " + "filetype must be provided.") + return filetypes[filetype](path, mode=mode, **kwargs) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index e3d9a00..75d6c73 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -64,7 +64,23 @@ class BaseInferenceFile(h5py.File): injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): - super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + fp = super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + # check that file type matches self + try: + filetype = fp.attrs['filetype'] + except KeyError: + if mode == 'w': + # first time creating the file, add this class's name + filetype = self.name + fp.attrs['filetype'] = filetype + else: + filetype = None + if filetype != self.name: + raise ValueError("This file has filetype {}, whereas this class " + "is named {}. This indicates that the file was " + "not written by this class, and so cannot be " + "read by this class.".format(filetype, self.name)) + return fp def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. @@ -444,42 +460,26 @@ def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): Parameters ---------- - thin_start : {None, int} - The starting index to use. If None, will try to retrieve the - `burn_in_iterations` from the given file. If no - `burn_in_iterations` exists, will default to the start of the - array. - thin_interval : {None, int} - The interval to use. If None, will try to retrieve the acl from the - given file. If no acl attribute exists, will default to 1. - thin_end : {None, int} - The end index to use. If None, will retrieve to the end of the - array. + thin_start : int, optional + The starting index to use. If None, will use the ``thin_start`` + attribute. + thin_interval : int, optional + The interval to use. If None, will use the ``thin_interval`` + attribute. + thin_end : int, optional + The end index to use. If None, will use the ``thin_end`` attribute. Returns ------- slice : The slice needed. """ - - # default is to skip burn in samples if thin_start is None: - try: - thin_start = self.burn_in_iterations - # if the sampler hasn't burned in, the burn_in_iterations will - # be the same as the number of iterations, which would result - # in 0 samples. 
In that case, just use the last one - if thin_start == self.niterations: - thin_start = thin_start - 1 - except KeyError: - pass - - # default is to use stored ACL and accept every i-th sample + thin_start = self.thin_start if thin_interval is None: - try: - thin_interval = int(numpy.ceil(self.acl)) - except KeyError: - pass + thin_interval = self.thin_interval + if thin_end is None: + thin_end = self.thin_end return slice(thin_start, thin_end, thin_interval) def copy_metadata(self, other): From 704d417c21e15c5ce01ebc0d75314d4492b5283e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 11:48:49 +0200 Subject: [PATCH 35/47] fix some import errors --- gwin/sampler/base_mcmc.py | 4 ++-- gwin/sampler/emcee.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 71b6f8b..4dc5c6d 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -23,7 +23,7 @@ # """Provides constructor classes and convenience functions for MCMC samplers.""" -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import (ABCMeta, abstractmethod, abstractproperty) import logging import numpy @@ -393,7 +393,7 @@ def compute_acl(cls, filename, **kwargs): pass -class EnsembleMCMCAutocorrSupport(object): +class MCMCAutocorrSupport(object): """Provides class methods for calculating ensemble ACFs/ACLs. """ diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index a16cde9..d935cb3 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -30,14 +30,13 @@ import numpy import emcee -from pycbc.io import FieldArray -from pycbc.filter import autocorrelation from pycbc.pool import choose_pool from pycbc.workflow import ConfigParser from .base import BaseSampler -from .base_mcmc import (BaseMCMC, raw_samples_to_dict, raw_stats_to_dict) -from gwin import burn_in +from .base_mcmc import (BaseMCMC, MCMCAutocorrSupport, raw_samples_to_dict, + raw_stats_to_dict) +from gwin.burn_in import MCMCBurnInTests # @@ -48,7 +47,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): +class EmceeEnsembleSampler(MCMCAutocorrSupport, BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. @@ -65,7 +64,7 @@ class EmceeEnsembleSampler(EnsembleMCMCAutocorrSupport, BaseMCMC, BaseSampler): """ name = "emcee" _io = EmceeFile - burn_in_class = burn_in.MCMCBurnInTests + burn_in_class = MCMCBurnInTests def __init__(self, model, nwalkers, logpost_function=None, nprocesses=1, use_mpi=False): From adee9c3e6d13c0d0ab6d2026f785a9f9cd269abb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 13:46:42 +0200 Subject: [PATCH 36/47] remove sampler_class from io to avoid circular imports --- gwin/io/base_hdf.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 75d6c73..fa3f86c 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -25,6 +25,8 @@ inference samplers generate. """ +from __future__ import absolute_import + import os import sys import logging @@ -39,8 +41,6 @@ from pycbc.types import FrequencySeries from pycbc.waveform import parameters as wfparams -from .. import sampler as gwin_sampler - class BaseInferenceFile(h5py.File): """Base class for all inference hdf files. 
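For orientation on why ``sampler_class`` is being removed in this patch: after PATCH 34, file handling is dispatched purely on the ``filetype`` attribute via ``loadfile``, so a results file can be read back without importing ``gwin.sampler`` at all. The sketch below is illustrative only; the file name and parameter names are hypothetical placeholders, and only ``loadfile``, ``read_samples``, and the ``filetype`` attribute come from the diffs.

from gwin.io import loadfile

# 'results.hdf' stands in for an already-written results file
with loadfile('results.hdf', 'r') as fp:
    filetype = fp.attrs['filetype']   # e.g. 'emcee_file'
    # the parameter names here are placeholders for whatever was sampled
    samples = fp.read_samples(['mass1', 'mass2'])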
@@ -205,15 +205,6 @@ def write_posterior(self, posterior_file, **kwargs): """ pass - @property - def sampler_class(self): - """Returns the sampler class that was used.""" - try: - sampler = self.sampler_name - except KeyError: - return None - return gwin_sampler.samplers[sampler] - @property def static_params(self): """Returns a dictionary of the static_params. The keys are the argument From 36a5e75b66e580f3dff4129342b3ee24b78295c2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 31 Jul 2018 22:19:54 +0200 Subject: [PATCH 37/47] fix bugs --- bin/gwin | 38 +++++----- gwin/io/__init__.py | 144 +++++++++++++++++++++++++++++++++++++- gwin/io/base_hdf.py | 58 ++------------- gwin/io/base_mcmc.py | 44 +++++------- gwin/io/emcee.py | 5 +- gwin/models/base.py | 2 +- gwin/option_utils.py | 82 ---------------------- gwin/sampler/__init__.py | 40 +++++++++-- gwin/sampler/base.py | 16 +++-- gwin/sampler/base_mcmc.py | 80 ++++++++++++--------- gwin/sampler/emcee.py | 39 ++++++----- 11 files changed, 302 insertions(+), 246 deletions(-) diff --git a/bin/gwin b/bin/gwin index 9bf822c..26ce45b 100644 --- a/bin/gwin +++ b/bin/gwin @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (C) 2016 Christopher M. Biwer +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -32,8 +32,6 @@ from pycbc.waveform import generator import gwin from gwin import (__version__, burn_in, option_utils) -from gwin.io.hdf import InferenceFile -from gwin.option_utils import validate_checkpoint_files from gwin.calibration import Recalibrate # command line usage @@ -53,6 +51,12 @@ parser.add_argument("--save-backup", action="store_true", default=False, help="Don't delete the backup file after the run has " "completed.") +# parallelization options +parser.add_argument("--nprocesses", type=int, default=1, + help="Number of processes to use. If not given then only " + "a single core will be used.") +parser.add_argument("--use-mpi", action='store_true', default=False, + help="Use MPI to parallelize the sampler") # run duration options parser.add_argument("--nsamples", type=int, required=True, help="The number of samples the sampler should get. " @@ -143,8 +147,9 @@ with ctx: # get ifo-specific instances of calibration model if cp.has_section('calibration'): logging.info("Initializing calibration model") - recalibration = {ifo: Recalibrate.from_config(cp, ifo, section='calibration') for - ifo in opts.instruments} + recalibration = {ifo: Recalibrate.from_config(cp, ifo, + section='calibration') + for ifo in opts.instruments} model_args['recalibration'] = recalibration # get gates for templates @@ -157,10 +162,6 @@ with ctx: # construct class that will return the natural logarithm of likelihood model = gwin.models.read_from_config(cp, **model_args) - # FIXME: move to MCMC sampler - #burn_in_eval = burn_in.BurnIn(opts.burn_in_function, - # min_iterations=opts.min_burn_in) - logging.info("Setting up sampler") # Create sampler that will run. @@ -176,17 +177,18 @@ with ctx: # injection file all detectors. This # should be fixed in a future version of PyCBC. Once it is, # update this. Until then, just use the first file. 
- injection_file = opts.injection_file.values()[0] # None if not set + if opts.injection_file: + injection_file = opts.injection_file.values()[0] # None if not set + else: + injection_file = None sampler.setup_output(opts.output_file, force=opts.force, - injection_file=injetion_file) + injection_file=injection_file) - # set the walkers initial positions from a pre-existing InferenceFile - # or a specific initial distribution listed in the configuration file - # or else use the prior distributions to set initial positions - logging.info("Setting walkers initial conditions for varying parameters") + # Figure out where to get the initial conditions from: a samples file, + # the checkpoint file, the prior, or an initial prior. samples_file = opts.samples_file # use the checkpoint file instead if resume from checkpoint - if sampler.checkpoint_valid: + if not sampler.new_checkpoint: samples_file = sampler.checkpoint_file if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", @@ -194,9 +196,9 @@ with ctx: init_prior = None else: # try to load an initial distribution from the config file - init_prior = gwin.sampler.inital_dist_from_config(cp) + init_prior = gwin.sampler.initial_dist_from_config(cp) - sampler.set_initial_conditions(intial_distribution=init_prior, + sampler.set_initial_conditions(initial_distribution=init_prior, samples_file=samples_file) # Set the target number of samples for the sampler diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index 8b78ce3..ea519eb 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -17,7 +17,13 @@ """I/O utilities for GWIn """ +from __future__ import absolute_import + +import os +import shutil +import logging import h5py as _h5py + from .emcee import EmceeFile from .txt import InferenceTXTFile @@ -25,7 +31,6 @@ EmceeFile.name: EmceeFile, } - def loadfile(path, mode=None, filetype=None, **kwargs): """Loads the given file using the appropriate InferenceFile class. @@ -61,3 +66,140 @@ def loadfile(path, mode=None, filetype=None, **kwargs): raise IOError("The file appears not to exist. In this case, " "filetype must be provided.") return filetypes[filetype](path, mode=mode, **kwargs) + +# +# ============================================================================= +# +# HDF Utilities +# +# ============================================================================= +# + + +def check_integrity(filename): + """Checks the integrity of an InferenceFile. + + Checks done are: + + * can the file open? + * do all of the datasets in the samples group have the same shape? + * can the first and last sample in all of the datasets in the samples + group be read? + + If any of these checks fail, an IOError is raised. + + Parameters + ---------- + filename: str + Name of an InferenceFile to check. + + Raises + ------ + ValueError + If the given file does not exist. + KeyError + If the samples group does not exist. + IOError + If any of the checks fail. 
+ """ + # check that the file exists + if not os.path.exists(filename): + raise ValueError("file {} does not exist".format(filename)) + # if the file is corrupted such that it cannot be opened, the next line + # will raise an IOError + with loadfile(filename, 'r') as fp: + # check that all datasets in samples have the same shape + parameters = fp[fp.samples_group].keys() + group = fp.samples_group + '/{}' + # use the first parameter as a reference shape + ref_shape = fp[group.format(parameters[0])].shape + if not all(fp[group.format(param)].shape == ref_shape + for param in parameters): + raise IOError("not all datasets in the samples group have the " + "same shape") + # check that we can read the first/last sample + firstidx = tuple([0]*len(ref_shape)) + lastidx = tuple([-1]*len(ref_shape)) + for param in parameters: + fp[group.format(param)][firstidx] + fp[group.format(param)][lastidx] + + +def validate_checkpoint_files(checkpoint_file, backup_file): + """Checks if the given checkpoint and/or backup files are valid. + + The checkpoint file is considered valid if: + + * it passes all tests run by ``check_integrity``; + * it has at least one sample written to it (indicating at least one + checkpoint has happened). + + The same applies to the backup file. The backup file must also have the + same number of samples as the checkpoint file, otherwise, the backup is + considered invalid. + + If the checkpoint (backup) file is found to be valid, but the backup + (checkpoint) file is not valid, then the checkpoint (backup) is copied to + the backup (checkpoint). Thus, this function ensures that checkpoint and + backup files are either both valid or both invalid. + + Parameters + ---------- + checkpoint_file : string + Name of the checkpoint file. + backup_file : string + Name of the backup file. + + Returns + ------- + checkpoint_valid : bool + Whether or not the checkpoint (and backup) file may be used for loading + samples. 
+ """ + # check if checkpoint file exists and is valid + try: + check_integrity(checkpoint_file) + checkpoint_valid = True + except (ValueError, KeyError, IOError): + checkpoint_valid = False + # backup file + try: + check_integrity(backup_file) + backup_valid = True + except (ValueError, KeyError, IOError): + backup_valid = False + # check if there are any samples in the file; if not, we'll just start from + # scratch + if checkpoint_valid: + with loadfile(checkpoint_file, 'r') as fp: + try: + group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) + nsamples = fp[group].size + checkpoint_valid = nsamples != 0 + except KeyError: + checkpoint_valid = False + # check if there are any samples in the backup file + if backup_valid: + with loadfile(backup_file, 'r') as fp: + try: + group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) + backup_nsamples = fp[group].size + backup_valid = backup_nsamples != 0 + except KeyError: + backup_valid = False + # check that the checkpoint and backup have the same number of samples; + # if not, assume the checkpoint has the correct number + if checkpoint_valid and backup_valid: + backup_valid = nsamples == backup_nsamples + # decide what to do based on the files' statuses + if checkpoint_valid and not backup_valid: + # copy the checkpoint to the backup + logging.info("Backup invalid; copying checkpoint file") + shutil.copy(checkpoint_file, backup_file) + backup_valid = True + elif backup_valid and not checkpoint_valid: + logging.info("Checkpoint invalid; copying backup file") + # copy the backup to the checkpoint + shutil.copy(backup_file, checkpoint_file) + checkpoint_valid = True + return checkpoint_valid diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index fa3f86c..8355ecb 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -64,15 +64,15 @@ class BaseInferenceFile(h5py.File): injections_group = 'injections' def __init__(self, path, mode=None, **kwargs): - fp = super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + super(BaseInferenceFile, self).__init__(path, mode, **kwargs) # check that file type matches self try: - filetype = fp.attrs['filetype'] + filetype = self.attrs['filetype'] except KeyError: if mode == 'w': # first time creating the file, add this class's name filetype = self.name - fp.attrs['filetype'] = filetype + self.attrs['filetype'] = filetype else: filetype = None if filetype != self.name: @@ -80,7 +80,6 @@ def __init__(self, path, mode=None, **kwargs): "is named {}. This indicates that the file was " "not written by this class, and so cannot be " "read by this class.".format(filetype, self.name)) - return fp def __getattr__(self, attr): """Things stored in ``.attrs`` are promoted to instance attributes. @@ -314,7 +313,7 @@ def write_random_state(self, group=None, state=None): if state is None: state = numpy.random.get_state() s, arr, pos, has_gauss, cached_gauss = state - if group in self: + if dataset_name in self: self[dataset_name][:] = arr else: self.create_dataset(dataset_name, arr.shape, fletcher32=True, @@ -635,52 +634,3 @@ def write_kwargs_to_hdf_attrs(attrs, **kwargs): write_kwargs_to_hdf_attrs(attrs, **val) else: attrs[arg] = val - - -def check_integrity(filename): - """Checks the integrity of an InferenceFile. - - Checks done are: - - * can the file open? - * do all of the datasets in the samples group have the same shape? - * can the first and last sample in all of the datasets in the samples - group be read? - - If any of these checks fail, an IOError is raised. 
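A minimal usage sketch of these two helpers (paths are illustrative):

    from gwin.io import validate_checkpoint_files

    checkpoint_file = 'run.hdf.checkpoint'
    backup_file = 'run.hdf.bkup'
    # True if at least one of the two files passes check_integrity and holds
    # samples; the invalid file, if any, is overwritten by a copy of the valid one
    can_resume = validate_checkpoint_files(checkpoint_file, backup_file)
    if not can_resume:
        print("no usable checkpoint; a new output file will be created")
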
- - Parameters - ---------- - filename: str - Name of an InferenceFile to check. - - Raises - ------ - ValueError - If the given file does not exist. - KeyError - If the samples group does not exist. - IOError - If any of the checks fail. - """ - # check that the file exists - if not os.path.exists(filename): - raise ValueError("file {} does not exist".format(filename)) - # if the file is corrupted such that it cannot be opened, the next line - # will raise an IOError - with InferenceFile(filename, 'r') as fp: - # check that all datasets in samples have the same shape - parameters = fp[fp.samples_group].keys() - group = fp.samples_group + '/{}' - # use the first parameter as a reference shape - ref_shape = fp[group.format(parameters[0])].shape - if not all(fp[group.format(param)].shape == ref_shape - for param in parameters): - raise IOError("not all datasets in the samples group have the " - "same shape") - # check that we can read the first/last sample - firstidx = tuple([0]*len(ref_shape)) - lastidx = tuple([-1]*len(ref_shape)) - for param in parameters: - fp[group.format(param)][firstidx] - fp[group.format(param)][lastidx] diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index ef834dc..d2087d4 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -24,24 +24,12 @@ """Provides I/O that is specific to MCMC samplers. """ -import os -import sys -import logging -from abc import ABCMeta +from abc import (ABCMeta, abstractmethod) import numpy -import h5py -from pycbc import DYN_RANGE_FAC -from pycbc.io import FieldArray -from pycbc.types import FrequencySeries -from pycbc.waveform import parameters as wfparams - -from .hdf import InferenceFile - - -class MCMCIO(obect): +class MCMCIO(object): """Abstract base class that provides some IO functions for ensemble MCMCs. """ __metaclass__ = ABCMeta @@ -82,25 +70,27 @@ def write_samples(self, samples, parameters=None, h5py. 
""" nwalkers, niterations = samples.values()[0].shape - assert(all(p.shape == (nwalkers, niterations) - for p in samples.values()), + assert all(p.shape == (nwalkers, niterations) + for p in samples.values()), ( "all samples must have the same shape") if max_iterations is not None and max_iterations < niterations: raise IndexError("The provided max size is less than the " "number of iterations") group = self.samples_group + '/{name}' + if parameters is None: + parameters = samples.keys() # loop over number of dimensions for param in parameters: dataset_name = group.format(name=param) istart = start_iteration try: - fp_niterations = fp[dataset_name].shape[-1] + fp_niterations = self[dataset_name].shape[-1] if istart is None: istart = fp_niterations istop = istart + niterations if istop > fp_niterations: # resize the dataset - fp[dataset_name].resize(istop, axis=1) + self[dataset_name].resize(istop, axis=1) except KeyError: # dataset doesn't exist yet if istart is not None and istart != 0: @@ -108,10 +98,10 @@ def write_samples(self, samples, parameters=None, "but dataset doesn't exist yet") istart = 0 istop = istart + niterations - fp.create_dataset(dataset_name, (nwalkers, istop), - maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, istart:istop] = samples[param] + self.create_dataset(dataset_name, (nwalkers, istop), + maxshape=(nwalkers, max_iterations), + dtype=float, fletcher32=True) + self[dataset_name][:, istart:istop] = samples[param] def read_raw_samples(self, fields, thin_start=None, thin_interval=None, thin_end=None, @@ -139,16 +129,14 @@ def read_raw_samples(self, fields, if iteration is not None: get_index = iteration else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) + get_index = self.get_slice(thin_start=thin_start, + thin_end=thin_end, + thin_interval=thin_interval) # load group = self.samples_group + '/{name}' arrays = {} for name in fields: - arr = fp[group.format(name=name)][widx, get_index] + arr = self[group.format(name=name)][widx, get_index] if flatten: arr = arr.flatten() arrays[name] = arr diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index e2ad663..8331226 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -25,7 +25,7 @@ """ from .base_hdf import BaseInferenceFile -from .base_mcmc import EnsembleMCMCIO +from .base_mcmc import MCMCIO class EmceeFile(MCMCIO, BaseInferenceFile): @@ -70,3 +70,6 @@ def write_acceptance_fraction(self, acceptance_fraction): except KeyError: # dataset doesn't exist yet, create it self[group] = acceptance_fraction + + def write_posterior(self, filename, **kwargs): + pass diff --git a/gwin/models/base.py b/gwin/models/base.py index d5a3d5e..9c4598c 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -746,7 +746,7 @@ def from_config(cls, cp, **kwargs): def write_metadata(self, fp): """Writes metadata to the given file handler.""" - fp.attrs['model'] = sampler.model.name + fp.attrs['model'] = self.name fp.attrs['variable_params'] = list(self.variable_params) fp.attrs['sampling_params'] = list(self.sampling_params) write_kwargs_to_hdf_attrs(fp.attrs, static_params=self.static_params) diff --git a/gwin/option_utils.py b/gwin/option_utils.py index 5fe539e..47ff79c 100644 --- a/gwin/option_utils.py +++ b/gwin/option_utils.py @@ -191,86 +191,6 @@ def sampler_from_cli(opts, model, pool=None): # # 
----------------------------------------------------------------------------- -def validate_checkpoint_files(checkpoint_file, backup_file): - """Checks if the given checkpoint and/or backup files are valid. - - The checkpoint file is considered valid if: - - * it passes all tests run by ``InferenceFile.check_integrity``; - * it has at least one sample written to it (indicating at least one - checkpoint has happened). - - The same applies to the backup file. The backup file must also have the - same number of samples as the checkpoint file, otherwise, the backup is - considered invalid. - - If the checkpoint (backup) file is found to be valid, but the backup - (checkpoint) file is not valid, then the checkpoint (backup) is copied to - the backup (checkpoint). Thus, this function ensures that checkpoint and - backup files are either both valid or both invalid. - - Parameters - ---------- - checkpoint_file : string - Name of the checkpoint file. - backup_file : string - Name of the backup file. - - Returns - ------- - checkpoint_valid : bool - Whether or not the checkpoint (and backup) file may be used for loading - samples. - """ - # check if checkpoint file exists and is valid - logging.info("Validating checkpoint and backup files") - try: - check_integrity(checkpoint_file) - checkpoint_valid = True - except (ValueError, KeyError, IOError): - checkpoint_valid = False - # backup file - try: - check_integrity(backup_file) - backup_valid = True - except (ValueError, KeyError, IOError): - backup_valid = False - # check if there are any samples in the file; if not, we'll just start from - # scratch - if checkpoint_valid: - with InferenceFile(checkpoint_file, 'r') as fp: - try: - group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) - nsamples = fp[group].size - checkpoint_valid = nsamples != 0 - except KeyError: - checkpoint_valid = False - # check if there are any samples in the backup file - if backup_valid: - with InferenceFile(backup_file, 'r') as fp: - try: - group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) - backup_nsamples = fp[group].size - backup_valid = backup_nsamples != 0 - except KeyError: - backup_valid = False - # check that the checkpoint and backup have the same number of samples; - # if not, assume the checkpoint has the correct number - if checkpoint_valid and backup_valid: - backup_valid = nsamples == backup_nsamples - # decide what to do based on the files' statuses - if checkpoint_valid and not backup_valid: - # copy the checkpoint to the backup - logging.info("Backup invalid; copying checkpoint file") - shutil.copy(checkpoint_file, backup_file) - backup_valid = True - elif backup_valid and not checkpoint_valid: - logging.info("Checkpoint invalid; copying backup file") - # copy the backup to the checkpoint - shutil.copy(backup_file, checkpoint_file) - checkpoint_valid = True - return checkpoint_valid - def add_low_frequency_cutoff_opt(parser): """Adds the low-frequency-cutoff option to the given parser.""" @@ -325,7 +245,6 @@ def data_from_cli(opts): precision="double") # apply gates if not waiting to overwhiten if not opts.gate_overwhitened: - logging.info("Applying gates to strain data") strain_dict = apply_gates_to_td(strain_dict, gates) # get strain time series to use for PSD estimation @@ -350,7 +269,6 @@ def data_from_cli(opts): # FFT strain and save each of the length of the FFT, delta_f, and # low frequency cutoff to a dict - logging.info("FFT strain") stilde_dict = {} length_dict = {} delta_f_dict = {} diff --git a/gwin/sampler/__init__.py 
b/gwin/sampler/__init__.py index 6154aee..5b6e435 100644 --- a/gwin/sampler/__init__.py +++ b/gwin/sampler/__init__.py @@ -17,14 +17,42 @@ This modules provides a list of implemented samplers for parameter estimation. """ -from .kombine import KombineSampler -from .emcee import (EmceeEnsembleSampler, EmceePTSampler) -from .mcmc import MCMCSampler +from __future__ import absolute_import + +from .base import (initial_dist_from_config, create_new_output_file) +# from .kombine import KombineSampler +from .emcee import (EmceeEnsembleSampler, ) # EmceePTSampler) +# from .mcmc import MCMCSampler # list of available samplers samplers = {cls.name: cls for cls in ( - KombineSampler, + #KombineSampler, EmceeEnsembleSampler, - EmceePTSampler, - MCMCSampler, + #EmceePTSampler, + #MCMCSampler, )} + + +def load_from_config(cp, model, **kwargs): + """Loads a sampler from the given config file. + + This looks for a name in the section ``[sampler]`` to determine which + sampler class to load. That sampler's ``from_config`` is then called. + + Parameters + ---------- + cp : WorkflowConfigParser + Config parser to read from. + model : gwin.model + Which model to pass to the sampler. + \**kwargs : + All other keyword arguments are passed directly to the sampler's + ``from_config`` file. + + Returns + ------- + sampler : + The initialized sampler. + """ + name = cp.get('sampler', 'name') + return samplers[name].from_config(cp, model, **kwargs) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index f0f3b48..af041fa 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -26,12 +26,15 @@ """ from abc import ABCMeta, abstractmethod, abstractproperty +import os import numpy +import shutil from pycbc.io import FieldArray from pycbc.filter import autocorrelation import h5py import logging +from ..io import validate_checkpoint_files # # ============================================================================= @@ -176,20 +179,23 @@ def setup_output(self, output_file, force=False, injection_file=None): checkpoint_file = output_file + '.checkpoint' backup_file = output_file + '.bkup' # check if we have a good checkpoint and/or backup file + logging.info("Looking for checkpoint file") checkpoint_valid = validate_checkpoint_files(checkpoint_file, backup_file) # Create a new file if the checkpoint doesn't exist, or if it is # corrupted + self.new_checkpoint = False # keeps track if this is a new file or not if not checkpoint_valid: - self.create_new_output_file(checkpoint_file, force=force, - injection_file=injection_file) + logging.info("Checkpoint not found or not valid") + create_new_output_file(self, checkpoint_file, force=force, + injection_file=injection_file) # now the checkpoint is valid - checkpoint_valid = True + self.new_checkpoint = True # copy to backup shutil.copy(checkpoint_file, backup_file) # write the command line for fn in [checkpoint_file, backup_file]: - with sampler.io(fn, "a") as fp: + with self.io(fn, "a") as fp: fp.write_command_line() # store self.checkpoint_file = checkpoint_file @@ -263,7 +269,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, fp.write_injections(injection_file) -def intial_dist_from_config(cp): +def initial_dist_from_config(cp): """Loads a distribution for the sampler start from the given config file. 
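A sketch of the intended top-level usage (the config-file name is illustrative,
and the model may need additional keyword arguments, e.g. the data, as in
``bin/gwin``):

    from pycbc.workflow import ConfigParser
    import gwin

    cp = ConfigParser(['inference.ini'])
    model = gwin.models.read_from_config(cp)
    # [sampler] name = ... selects the class; everything else in that section
    # is handled by the chosen class's from_config
    sampler = gwin.sampler.load_from_config(cp, model, nprocesses=1,
                                            use_mpi=False)
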
A distribution will only be loaded if the config file has a [initial-*] diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 4dc5c6d..6462ad8 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -23,9 +23,14 @@ # """Provides constructor classes and convenience functions for MCMC samplers.""" +from __future__ import absolute_import + from abc import (ABCMeta, abstractmethod, abstractproperty) import logging import numpy +from pycbc.filter import autocorrelation + +from ..io import validate_checkpoint_files # # ============================================================================= @@ -65,7 +70,10 @@ def raw_samples_to_dict(sampler, raw_samples): samples = sampler.model.prior_distribution.apply_boundary_conditions( **samples) # apply transforms to go to model's variable params space - return sampler.model.sampling_transforms.apply(samples, inverse=True) + if sampler.model.sampling_transforms is not None: + samples = sampler.model.sampling_transforms.apply( + samples, inverse=True) + return samples def raw_stats_to_dict(sampler, raw_stats): @@ -132,6 +140,7 @@ class BaseMCMC(object): _p0 = None _nwalkers = None _burn_in = None + _checkpoint_interval = None @abstractproperty def base_shape(self): @@ -155,13 +164,23 @@ def nwalkers(self): def niterations(self): """Get the current number of iterations.""" itercounter = self._itercounter - if _itercounter is None: + if itercounter is None: itercounter = 0 lastclear = self._lastclear if lastclear is None: lastclear = 0 return itercounter + lastclear + @property + def checkpoint_interval(self): + """The number of iterations to do between checkpoints.""" + return self._checkpoint_interval + + @abstractmethod + def clear_samples(self): + """A method to clear samples from memory.""" + pass + @property def pos(self): pos = self._pos @@ -209,19 +228,20 @@ def set_p0(self, samples_file=None, prior=None): samples = fp.read_samples(self.variable_params, iteration=-1) # make sure we have the same shape - assert(samples.shape[:-1] == self.samples_shape, + assert samples.shape == self.base_shape, ( "samples in file {} have shape {}, but I have shape {}". - format(samples_file, samples.shape, self.samples_shape)) + format(samples_file, samples.shape, self.base_shape)) # transform to sampling parameter space - samples = self.model.sampling_transforms.apply(samples) + if self.model.sampling_transforms is not None: + samples = self.model.sampling_transforms.apply(samples) # draw random samples if samples are not provided else: - nsamples = numpy.prod(self.samples_shape) + nsamples = numpy.prod(self.base_shape) samples = self.model.prior_rvs(size=nsamples, prior=prior).reshape( - self.samples_shape) - # store as ND array with shape [samples_shape] x nparams + self.base_shape) + # store as ND array with shape [base_shape] x nparams ndim = len(self.variable_params) - p0 = numpy.ones(list(self.samples_shape)+[ndim]) + p0 = numpy.ones(list(self.base_shape)+[ndim]) for i, param in enumerate(self.sampling_params): p0[..., i] = samples[param] self._p0 = p0 @@ -246,12 +266,6 @@ def set_state_from_file(self, filename): """ pass - @abstractmethod - def write_state(self, filename): - """Saves the state of the sampler to the given file. 
- """ - pass - def run(self): """Runs the sampler.""" @@ -266,7 +280,7 @@ def run(self): # contains (either due to sampler burn-in, or a previous checkpoint) try: with self.io(self.checkpoint_file, "r") as fp: - start = fp.niterations + startiter = fp.niterations except KeyError: startiter = 0 if self.require_indep_samples: @@ -312,7 +326,7 @@ def run(self): nsamples += iterinterval * self.nwalkers self._itercounter = startiter = enditer - @propetry + @property def burn_in(self): """The class for doing burn-in tests (if specified).""" return self._burn_in @@ -321,6 +335,7 @@ def set_burn_in(self, burn_in): """Sets the object to use for doing burn-in tests.""" self._burn_in = burn_in + @property def n_indep_samples(self): """The number of independent samples post burn-in that the sampler has acquired so far.""" @@ -360,25 +375,26 @@ def checkpoint(self): # it, in which case we don't need to do it again. if self.acls is None: logging.info("Computing acls") - self.acls = self.compute_acls(self.checkpoint_file) + self.acls = self.compute_acl(self.checkpoint_file) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: if self.burn_in is not None: fp.write_burn_in(self.burn_in) if self.acls is not None: - fp.write_acls(acls) + fp.write_acls(self.acls) # write the current number of iterations fp.attrs['niterations'] = self.niterations fp.attrs['n_indep_samples'] = self.n_indep_samples # check validity + logging.info("Validating checkpoint and backup files") checkpoint_valid = validate_checkpoint_files( self.checkpoint_file, self.backup_file) if not checkpoint_valid: raise IOError("error writing to checkpoint file") # clear the in-memory chain to save memory - logging.info("Clearing chain") - self.clear_chain() + logging.info("Clearing samples from memory") + self.clear_samples() @abstractmethod def compute_acf(cls, filename, **kwargs): @@ -398,8 +414,8 @@ class MCMCAutocorrSupport(object): """ @classmethod - def compute_acfs(cls, filename, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None): + def compute_acf(cls, filename, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None): """Computes the autocorrleation function of the model params in the given file. @@ -435,7 +451,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, ``nwalkers x niterations``. 
""" acfs = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: if parameters is None: parameters = fp.variable_params if isinstance(parameters, str) or isinstance(parameters, unicode): @@ -446,15 +462,15 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, if walkers is None: walkers = numpy.arange(fp.nwalkers) arrays = [ - cls.compute_acfs(filename, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] + cls.compute_acf(filename, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] for ii in walkers] acfs[param] = numpy.vstack(arrays) else: samples = fp.read_raw_samples( - fp, param, thin_start=start_index, thin_interval=1, + param, thin_start=start_index, thin_interval=1, thin_end=end_index, walkers=walkers, flatten=False)[param] samples = samples.mean(axis=0) @@ -463,7 +479,7 @@ def compute_acfs(cls, filename, start_index=None, end_index=None, return acfs @classmethod - def compute_acls(cls, filename, start_index=None, end_index=None): + def compute_acl(cls, filename, start_index=None, end_index=None): """Computes the autocorrleation length for all model params in the given file. @@ -489,10 +505,10 @@ def compute_acls(cls, filename, start_index=None, end_index=None): A dictionary giving the ACL for each parameter. """ acls = {} - with cls.io(filename, 'r') as fp: + with cls._io(filename, 'r') as fp: for param in fp.variable_params: samples = fp.read_raw_samples( - fp, param, thin_start=start_index, thin_interval=1, + param, thin_start=start_index, thin_interval=1, thin_end=end_index, flatten=False)[param] samples = samples.mean(axis=0) acl = autocorrelation.calculate_acl(samples) diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index d935cb3..1888c67 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -36,7 +36,9 @@ from .base import BaseSampler from .base_mcmc import (BaseMCMC, MCMCAutocorrSupport, raw_samples_to_dict, raw_stats_to_dict) -from gwin.burn_in import MCMCBurnInTests +from ..burn_in import MCMCBurnInTests +from ..io import EmceeFile +from .. import models # @@ -66,8 +68,8 @@ class EmceeEnsembleSampler(MCMCAutocorrSupport, BaseMCMC, BaseSampler): _io = EmceeFile burn_in_class = MCMCBurnInTests - def __init__(self, model, nwalkers, logpost_function=None, - nprocesses=1, use_mpi=False): + def __init__(self, model, nwalkers, checkpoint_interval=None, + logpost_function=None, nprocesses=1, use_mpi=False): self.model = model # create a wrapper for calling the model @@ -93,6 +95,7 @@ def __init__(self, model, nwalkers, logpost_function=None, # to have the same state as the numpy generator rstate = numpy.random.get_state() self._sampler.random_state = rstate + self._checkpoint_interval = checkpoint_interval @property def io(self): @@ -124,7 +127,11 @@ def model_stats(self): The returned array has shape ``nwalkers x niterations``. """ - return raw_samples_to_dict(self._sampler.blobs, raw_stats) + raw_stats = numpy.array(self._sampler.blobs) + # raw_stats has shape niterations x nwalkers x nstats; transpose + # so that it has shape nwalkers x niterations x nstats + raw_stats = raw_stats.transpose((1, 0, 2)) + return raw_samples_to_dict(self, raw_stats) def clear_samples(self): """Clears the samples and stats from memory. @@ -154,20 +161,10 @@ def run_mcmc(self, niterations, **kwargs): Number of iterations to run the sampler for. \**kwargs : All other keyword arguments are passed to the emcee sampler. 
- - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). - lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). - rstate : - The current state of the random number generator. """ pos = self._pos if pos is None: - pos = self.p0 + pos = self._p0 res = self._sampler.run_mcmc(pos, niterations, **kwargs) p, _, _ = res[0], res[1], res[2] # update the positions @@ -207,16 +204,22 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): "name in section [sampler] must match mine") # get the number of walkers to use nwalkers = int(cp.get(section, "nwalkers")) + # get the checkpoint interval, if it's specified + if cp.has_option(section, "checkpoint-interval"): + checkpoint_interval = int(cp.get(section, "checkpoint-interval")) + else: + checkpoint_interval = None if cp.has_option(section, "logpost-function"): lnpost = cp.get(section, "logpost-function") else: lnpost = None - obj = cls(model, nwalkers, logpost_function=lnpost, - nprocesses=nprocesses, use_mpi=use_mpi) + obj = cls(model, nwalkers, checkpoint_interval=checkpoint_interval, + logpost_function=lnpost, nprocesses=nprocesses, + use_mpi=use_mpi) # add burn-in if it's specified try: bit = obj.burn_in_class.from_config(cp, obj) - except ConfigParser.NoSectionError: + except ConfigParser.Error: bit = None obj.set_burn_in(bit) return obj From e87158234c514c35ad018ec5d0cad53ae86e6bea Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 3 Aug 2018 10:58:01 +0200 Subject: [PATCH 38/47] fix bugs, move niterations/nsamples into config file --- bin/gwin | 15 ------ gwin/io/base_hdf.py | 28 ++++++++-- gwin/io/base_mcmc.py | 22 +++++++- gwin/sampler/base.py | 38 ++------------ gwin/sampler/base_mcmc.py | 104 ++++++++++++++++++++++++-------------- gwin/sampler/emcee.py | 17 +++++-- 6 files changed, 126 insertions(+), 98 deletions(-) diff --git a/bin/gwin b/bin/gwin index 26ce45b..cacded7 100644 --- a/bin/gwin +++ b/bin/gwin @@ -57,18 +57,6 @@ parser.add_argument("--nprocesses", type=int, default=1, "a single core will be used.") parser.add_argument("--use-mpi", action='store_true', default=False, help="Use MPI to parallelize the sampler") -# run duration options -parser.add_argument("--nsamples", type=int, required=True, - help="The number of samples the sampler should get. " - "The sampler will run until it has acquired at least " - "this many samples. Depending on checkpoint settings " - "it may go over.") -parser.add_argument("--require-indep-samples", action="store_true", - default=False, - help="Require that the number of samples set by nsamples " - "be independent. If this is not set, MCMC samplers " - "will just run until they have the desried number of " - "raw samples (with no thinning).") parser.add_argument("--samples-file", default=None, help="Use an iteration from an InferenceFile as the " "initial proposal distribution. 
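With those options gone, the run length is set in the configuration file and
read by the sampler's ``from_config``; a sketch of the corresponding
``[sampler]`` section (values are illustrative, and ``niterations`` may be
given instead of ``effective-nsamples``):

    [sampler]
    name = emcee
    nwalkers = 5000
    effective-nsamples = 4000
    checkpoint-interval = 2000
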
The same " @@ -201,9 +189,6 @@ with ctx: sampler.set_initial_conditions(initial_distribution=init_prior, samples_file=samples_file) - # Set the target number of samples for the sampler - sampler.set_target(opts.nsamples, opts.require_indep_samples) - # Run the sampler sampler.run() diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 8355ecb..f52bf02 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -107,6 +107,14 @@ def write_samples(self, samples, **kwargs): """ pass + @abstractmethod + def write_sampler_metadata(self, sampler): + """This should write the given sampler's metadata to the file. + + This should also include the model's metadata. + """ + pass + def parse_parameters(self, parameters, array_class=None): """Parses a parameters arg to figure out what fields need to be loaded. @@ -212,14 +220,18 @@ def static_params(self): return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} @property - def n_indep_samples(self): - """Returns the number of independent samples stored in the file. + def effective_nsamples(self): + """Returns the effective number of samples stored in the file. """ try: - return self.attrs['n_indep_samples'] + return self.attrs['effective_nsamples'] except KeyError: return 0 + def write_effective_nsamples(self, effective_nsamples): + """Writes the effective number of samples stored in the file.""" + self.attrs['effective_nsamples'] = effective_nsamples + @property def thin_start(self): """The default start index to use when reading samples. @@ -444,6 +456,16 @@ def write_command_line(self): previous = [] self.attrs["cmd"] = cmd + previous + @abstractmethod + def write_resume_point(self): + """Should write the point that a sampler starts up. + + How the resume point is indexed is up to the sampler. For example, + MCMC samplers use the number of iterations that are stored in the + checkpoint file. + """ + pass + def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): """Formats a slice using the given arguments that can be used to retrieve a thinned array from an InferenceFile. diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index d2087d4..d9a401a 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -156,6 +156,26 @@ def write_resume_point(self): resume_pts.append(niterations) self.attrs["resume_points"] = resume_pts + def write_niterations(self, niterations): + """Writes the given number of iterations to the sampler group.""" + self[self.sampler_group].attrs['niterations'] = niterations + + @property + def niterations(self): + """Returns the number of iterations the sampler was run for.""" + return self[self.sampler_group].attrs['niterations'] + + def write_sampler_metadata(self, sampler): + """Writes the sampler's metadata.""" + self.attrs['sampler'] = sampler.name + if self.sampler_group not in self.keys(): + # create the sampler group + self.create_group(self.sampler_group) + self[self.sampler_group].attrs['nwalkers'] = sampler.nwalkers + # write the model's metadata + sampler.model.write_metadata(self) + + def write_acls(self, acls): """Writes the given autocorrelation lengths. 
@@ -186,7 +206,7 @@ def write_acls(self, acls): self[group.format(param)] = acls[param] # write the maximum over all params acl = numpy.array(acls.values()).max() - self.attrs['acl'] = acl + self[self.sampler_group].attrs['acl'] = acl # set the default thin interval to be the acl self.attrs['thin_interval'] = acl diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index af041fa..86971f0 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -141,19 +141,6 @@ def finalize(self): """Do any finalization to the samples file before exiting.""" pass - def write_metadata(self, fp): - """Writes metadata about the sampler to the given filehandler.""" - fp.attrs['sampler'] = self.name - # write the model's metadata - self.model.write_metadata(fp) - self._write_more_metadata(fp) - - def _write_more_metadata(self, fp): - """Optional method that can be implemented if a sampler wants to write - more metadata than just its name and the model's metadata. - """ - pass - def setup_output(self, output_file, force=False, injection_file=None): """Sets up the sampler's checkpoint and output files. @@ -193,35 +180,16 @@ def setup_output(self, output_file, force=False, injection_file=None): self.new_checkpoint = True # copy to backup shutil.copy(checkpoint_file, backup_file) - # write the command line + # write the command line, startup for fn in [checkpoint_file, backup_file]: with self.io(fn, "a") as fp: fp.write_command_line() + fp.write_resume_point() # store self.checkpoint_file = checkpoint_file self.backup_file = backup_file self.checkpoint_valid = checkpoint_valid - def set_target(self, nsamples, require_independent=False): - """Sets the number of samples the sampler should try to acquire. - - If the ``must_be_independent`` flag is set, then the number of samples - must be independent. This means, for example, that MCMC chains are - thinned by their ACL before counting samples. Otherwise, the sampler - will just run until it has the requested number of samples, regardless - of thinning. - - Parameters - ---------- - nsamples : int - The number of samples to acquire. - must_be_independent : bool, optional - Add the requirement that the target number of samples be - independent. Default is False. - """ - self.target_nsamples = nsamples - self.require_indep_samples = require_independent - # # ============================================================================= @@ -261,7 +229,7 @@ def create_new_output_file(sampler, filename, force=False, injection_file=None, logging.info("Creating file {}".format(filename)) with sampler.io(filename, "w") as fp: # save the sampler's metadata - sampler.write_metadata(fp) + fp.write_sampler_metadata(sampler) # save injection parameters if injection_file is not None: logging.info("Writing injection file to output") diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 6462ad8..efaa16f 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -105,7 +105,7 @@ def raw_stats_to_dict(sampler, raw_stats): # therefore immediately convert this to a ND array. 
raw_stats = numpy.array(raw_stats) return {stat: raw_stats[..., ii] - for (ii, stat) in enumerate(self.model.default_stats)} + for (ii, stat) in enumerate(sampler.model.default_stats)} # # ============================================================================= @@ -134,13 +134,15 @@ class BaseMCMC(object): """ __metaclass__ = ABCMeta - _lastclear = None - _itercounter = None + _lastclear = None # the iteration when samples were cleared from memory + _itercounter = None # the number of iterations since the last clear _pos = None _p0 = None _nwalkers = None _burn_in = None _checkpoint_interval = None + _target_niterations = None + _target_eff_nsamples = None @abstractproperty def base_shape(self): @@ -176,6 +178,32 @@ def checkpoint_interval(self): """The number of iterations to do between checkpoints.""" return self._checkpoint_interval + @property + def target_niterations(self): + """The number of iterations the sampler should run for.""" + return self._target_niterations + + @property + def target_eff_nsamples(self): + """The target number of effective samples the sampler should get.""" + return self._target_eff_nsamples + + def set_target(self, niterations=None, eff_nsamples=None): + """Sets the target niterations/nsamples for the sampler. + + One or the other must be provided, not both. + """ + if niterations is None and eff_nsamples is None: + raise ValueError("Must provide a target niterations or " + "eff_nsamples") + if niterations is not None and eff_nsamples is not None: + raise ValueError("Must provide a target niterations or " + "eff_nsamples, not both") + self._target_niterations = int(niterations) \ + if niterations is not None else None + self._target_eff_nsamples = int(eff_nsamples) \ + if eff_nsamples is not None else None + @abstractmethod def clear_samples(self): """A method to clear samples from memory.""" @@ -268,63 +296,61 @@ def set_state_from_file(self, filename): def run(self): """Runs the sampler.""" - - if self.require_indep_samples and self.checkpoint_interval is None: + if self.target_eff_nsamples and self.checkpoint_interval is None: raise ValueError("A checkpoint interval must be set if " - "independent samples are required") + "targetting an effective number of samples") # get the starting number of samples: # "nsamples" keeps track of the number of samples we've obtained (if - # require_indep_samples is used, this is the number of independent + # target_eff_nsamples is not None, this is the effective number of # samples; otherwise, this is the total number of samples). 
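Equivalently, when driving a sampler instance directly, the target is chosen
with ``set_target`` (numbers are illustrative):

    # run for a fixed number of iterations ...
    sampler.set_target(niterations=1000)
    # ... or until enough effective (post burn-in, ACL-thinned) samples exist;
    # this mode requires a checkpoint_interval so the count can be re-checked
    sampler.set_target(eff_nsamples=4000)
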
- # "startiter" is the number of iterations that the file already + # _lastclear is the number of iterations that the file already # contains (either due to sampler burn-in, or a previous checkpoint) - try: + if self.new_checkpoint: + self._lastclear = 0 + else: with self.io(self.checkpoint_file, "r") as fp: - startiter = fp.niterations - except KeyError: - startiter = 0 - if self.require_indep_samples: + self._lastclear = fp.niterations + if self.target_eff_nsamples is not None: + target_nsamples = self.target_eff_nsamples with self.io(self.checkpoint_file, "r") as fp: - nsamples = fp.n_indep_samples - else: + nsamples = fp.effective_nsamples + elif self.target_niterations is not None: # the number of samples is the number of iterations times the # number of walkers - nsamples = startiter * self.nwalkers - # to ensure iterations are counted properly, the sampler's lastclear - # should be the same as start - self._lastclear = startiter - # keep track of the number of iterations we've done - self._itercounter = startiter + target_nsamples = self.nwalkers * self.target_niterations + nsamples = self._lastclear * self.nwalkers + else: + raise ValueError("must set either target_eff_nsamples or " + "target_niterations; see set_target") + self._itercounter = 0 # figure out the interval to use iterinterval = self.checkpoint_interval if iterinterval is None: - iterinterval = int(numpy.ceil( - float(self.target_nsamples) / self.nwalkers)) + iterinterval = self.target_niterations # run sampler until we have the desired number of samples - while nsamples < self.target_nsamples: - enditer = startiter + iterinterval + while nsamples < target_nsamples: # adjust the interval if we would go past the number of iterations - endnsamp = enditer * self.nwalkers - if endnsamp > self.target_nsamples \ - and not self.require_indep_samples: - iterinterval = int(numpy.ceil( - (endnsamp - self.target_nsamples) / self.nwalkers)) + if self.target_niterations is not None and ( + self.niterations + iterinterval > self.target_niterations): + iterinterval = self.target_niterations - self.niterations # run sampler and set initial values to None so that sampler # picks up from where it left off next call logging.info("Running sampler for {} to {} iterations".format( - startiter, enditer)) + self.niterations, self.niterations + iterinterval)) # run the underlying sampler for the desired interval self.run_mcmc(iterinterval) + # update the itercounter + #startiter = startiter + iterinterval + self._itercounter = self._itercounter + iterinterval # dump the current results self.checkpoint() # update nsamples for next loop - if self.require_indep_samples: - nsamples = self.n_indep_samples - logging.info("Have {} independent samples post burn in".format( + if self.target_eff_nsamples is not None: + nsamples = self.effective_nsamples + logging.info("Have {} effective samples post burn in".format( nsamples)) else: nsamples += iterinterval * self.nwalkers - self._itercounter = startiter = enditer @property def burn_in(self): @@ -336,8 +362,8 @@ def set_burn_in(self, burn_in): self._burn_in = burn_in @property - def n_indep_samples(self): - """The number of independent samples post burn-in that the sampler has + def effective_nsamples(self): + """The effective number of samples post burn-in that the sampler has acquired so far.""" if self.acls is None: acl = numpy.inf @@ -384,8 +410,8 @@ def checkpoint(self): if self.acls is not None: fp.write_acls(self.acls) # write the current number of iterations - fp.attrs['niterations'] = 
self.niterations - fp.attrs['n_indep_samples'] = self.n_indep_samples + fp.write_niterations(self.niterations) + fp.write_effective_nsamples(self.effective_nsamples) # check validity logging.info("Validating checkpoint and backup files") checkpoint_valid = validate_checkpoint_files( diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 1888c67..443f89d 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -101,10 +101,6 @@ def __init__(self, model, nwalkers, checkpoint_interval=None, def io(self): return self._io - def _write_more_metadata(self, fp): - """Adds nwalkers to the metadata.""" - fp.attrs['nwalkers'] = self.nwalkers - @property def base_shape(self): return (self.nwalkers,) @@ -131,13 +127,14 @@ def model_stats(self): # raw_stats has shape niterations x nwalkers x nstats; transpose # so that it has shape nwalkers x niterations x nstats raw_stats = raw_stats.transpose((1, 0, 2)) - return raw_samples_to_dict(self, raw_stats) + return raw_stats_to_dict(self, raw_stats) def clear_samples(self): """Clears the samples and stats from memory. """ # store the iteration that the clear is occuring on self._lastclear = self.niterations + self._itercounter = 0 # now clear the chain self._sampler.reset() self._sampler.clear_blobs() @@ -216,6 +213,16 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): obj = cls(model, nwalkers, checkpoint_interval=checkpoint_interval, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) + # get target + if cp.has_option(section, "niterations"): + niterations = int(cp.get(section, "niterations")) + else: + niterations = None + if cp.has_option(section, "effective-nsamples"): + nsamples = int(cp.get(section, "effective-nsamples")) + else: + nsamples = None + obj.set_target(niterations=niterations, eff_nsamples=nsamples) # add burn-in if it's specified try: bit = obj.burn_in_class.from_config(cp, obj) From 9046567b5263cbc7034d88de680975f932e0a559 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Fri, 3 Aug 2018 18:30:26 +0200 Subject: [PATCH 39/47] add halfchain, posterior_step, min_iterations back to burn_in --- gwin/burn_in.py | 143 ++++++++++++++++++++++++++++++------------------ 1 file changed, 89 insertions(+), 54 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index afd91a2..28aa8df 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -79,45 +79,6 @@ def ks_test(samples1, samples2, threshold=0.9): return is_the_same -def n_acl(chain, nacls=5): - """Burn in based on ACL. - - This applies the following test to determine burn in: - - 1. The first half of the chain is ignored. - - 2. An ACL is calculated from the second half. - - 3. If ``nacls`` times the ACL is < the number of iterations / 2, - the chain is considered to be burned in at the half-way point. - - Parameters - ---------- - chain : array - The chain of samples to apply the test to. Must be 1D. - nacls : int, optional - Number of ACLs to use for burn in. Default is 5. - - Returns - ------- - burn_in_idx : int - The burn in index. If the chain is not burned in, will be equal to the - length of the chain. - is_burned_in : bool - Whether or not the chain is burned in. - acl : int - The ACL that was estimated. - """ - kstart = int(len(chain)/2.) 
- acl = autocorrelation.calculate_acl(chain[kstart:]) - is_burned_in = nacls * acl < kstart - if is_burned_in: - burn_in_idx = kstart - else: - burn_in_idx = NOT_BURNED_IN_ITER - return burn_in_idx, is_burned_in, acl - - def max_posterior(lnps_per_walker, dim): """Burn in based on samples being within dim/2 of maximum posterior. @@ -126,7 +87,7 @@ def max_posterior(lnps_per_walker, dim): lnps_per_walker : 2D array Array of values that are proportional to the log posterior values. Must have shape ``nwalkers x niterations``. - dim : float + dim : int The dimension of the parameter space. Returns @@ -166,7 +127,7 @@ def posterior_step(logposts, dim): ---------- logposts : array 1D array of values that are proportional to the log posterior values. - dim : float + dim : int The dimension of the parameter space. Returns @@ -199,6 +160,10 @@ def posterior_step(logposts, dim): class MCMCBurnInTests(object): """Provides methods for estimating burn-in of an ensemble MCMC.""" + available_tests = ('halfchain', 'min_iterations', 'max_posterior', + 'posterior_step', 'nacl', 'ks_test', + ) + def __init__(self, sampler, burn_in_test, **kwargs): self.sampler = sampler # determine the burn-in tests that are going to be done @@ -207,21 +172,62 @@ def __init__(self, sampler, burn_in_test, **kwargs): self.burn_in_data = {t: {} for t in self.do_tests} self.is_burned_in = False self.burn_in_iteration = None - if 'nacl' in burn_in_tests: - # get the number of acls to use - self._nacls = int(kwargs.pop('nacls', 5)) - if 'ks_test' in burn_in_tests: - self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) - - def max_posterior(self, filename): - """Applies max posterior test to self.""" - with sampler.io(filename, 'r') as fp: + # Arguments specific to each test... + # for nacl: + self._nacls = int(kwargs.pop('nacls', 5)) + # for kstest: + self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) + # for max_posterior and posterior_step + self._ndim = int(kwargs.pop('ndim', len(sampler.variable_args))) + # for min iterations + self._min_iterations = int(kwargs.pop('min_iterations', 0)) + + def _getlogposts(self, filename): + """Convenience function for retrieving log posteriors. + + Parameters + ---------- + filename : str + The file to read. + + Returns + ------- + array + The log posterior values. They are not flattened, so have dimension + nwalkers x niterations. + """ + with self.sampler.io(filename, 'r') as fp: samples = fp.read_raw_samples( ['loglikelihood', 'logprior'], thin_start=0, thin_interval=1, flatten=False) logposts = samples['loglikelihood'] + samples['logprior'] + return logposts + + def halfchain(self, filename): + """Just uses half the chain as the burn-in iteration. + """ + with self.sampler.io(filename, 'r') as fp: + niters = fp.niterations + data = self.burn_in_data['halfchain'] + # this test cannot determine when something will burn in + # only when it was not burned in in the past + data['is_burned_in'] = True + data['burn_in_iteration'] = niters/2 + + def min_iterations(self, filename): + """Just checks that the sampler has been run for the minimum number + of iterations. 
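A sketch of attaching a combined burn-in test to a sampler without a config
file (the test string and keyword values are illustrative):

    from gwin.burn_in import MCMCBurnInTests

    # the test string is evaluated with each test name replaced by its result,
    # so '&' requires both tests to pass and '|' accepts either one
    burn_in = MCMCBurnInTests(sampler, 'nacl & max_posterior', nacls=5)
    sampler.set_burn_in(burn_in)
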
+ """ + with self.sampler.io(filename, 'r') as fp: + niters = fp.niterations + data = self.burn_in_data['min_iterations'] + data['is_burned_in'] = niters >= self._min_iterations + data['burn_in_iteration'] = self._min_iterations + def max_posterior(self, filename): + """Applies max posterior test to self.""" + logposts = self._getlogposts(filename) burn_in_idx, is_burned_in = burn_in.max_posterior( - logposts, len(self.variable_params)) + logposts, self._ndim) data = self.burn_in_data['max_posterior'] # required things to store data['is_burned_in'] = is_burned_in.all() @@ -230,9 +236,32 @@ def max_posterior(self, filename): data['iteration_per_walker'] = burn_in_idx data['status_per_walker'] = is_burned_in + def posterior_step(self, filename): + """Applies the posterior-step test.""" + logposts = self._getlogposts(filename) + burn_in_idx = numpy.array([posterior_step(logps, self._ndim) + for logps in logposts]) + data = self.burn_in_data['posterior_step'] + # this test cannot determine when something will burn in + # only when it was not burned in in the past + data['is_burned_in'] = True + data['burn_in_iteration'] = burn_in_idx.max() + # additional info + data['iteration_per_walker'] = burn_in_idx + def nacl(self, filename): - """Applies the nacl burn-in test""" - with sampler.io(filename, 'r') as fp: + """Burn in based on ACL. + + This applies the following test to determine burn in: + + 1. The first half of the chain is ignored. + + 2. An ACL is calculated from the second half. + + 3. If ``nacls`` times the ACL is < the number of iterations / 2, + the chain is considered to be burned in at the half-way point. + """ + with self.sampler.io(filename, 'r') as fp: niters = fp.niterations kstart = int(niters / 2.) acls = sampler.compute_acls(filename, start_index=kstart) @@ -252,7 +281,7 @@ def nacl(self, filename): def ks_test(self, filename): """Applies ks burn-in test.""" - with sampler.io(filename, 'r') as fp: + with self.sampler.io(filename, 'r') as fp: niters = fp.niterations # get the samples from the mid point samples1 = fp.read_raw_samples( @@ -319,4 +348,10 @@ def from_config(cls, cp, sampler): if cp.has_option_tag(section, 'ks-threshold', tag): kwargs['ks_threshold'] = float( cp.get_opt_tag(section, 'ks-threshold', tag)) + if cp.has_option_tag(section, 'ndim', tag): + kwargs['ndim'] = int( + cp.get_opt_tag(section, 'ndim', tag)) + if cp.has_option_tag(section, 'min-iterations', tag): + kwargs['min_iterations'] = int( + cp.get_opt_tag(section, 'min-iterations', tag)) return cls(sampler, burn_in_test, **kwargs) From 7254c84400ebb86f6f44637f0d08aac5adcf0da8 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:24:58 +0200 Subject: [PATCH 40/47] fix bugs to get acl working post burn in --- gwin/burn_in.py | 47 +++++++++++++++++++++++++-------------- gwin/io/base_mcmc.py | 9 +++++--- gwin/sampler/base_mcmc.py | 47 ++++++++++++++++++++++++++------------- 3 files changed, 68 insertions(+), 35 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 28aa8df..ac7ad43 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -112,8 +112,8 @@ def max_posterior(lnps_per_walker, dim): for ii in range(nwalkers): chain = lnps_per_walker[ii, :] passedidx = numpy.where(chain >= criteria)[0] - is_burned_in[ii] = is_burned_in = passedidx.size > 0 - if is_burned_in: + is_burned_in[ii] = passedidx.size > 0 + if is_burned_in[ii]: burn_in_idx[ii] = passedidx[0] else: burn_in_idx[ii] = NOT_BURNED_IN_ITER @@ -171,17 +171,29 @@ def __init__(self, sampler, burn_in_test, **kwargs): 
self.burn_in_test = burn_in_test self.burn_in_data = {t: {} for t in self.do_tests} self.is_burned_in = False - self.burn_in_iteration = None + self.burn_in_iteration = NOT_BURNED_IN_ITER # Arguments specific to each test... # for nacl: self._nacls = int(kwargs.pop('nacls', 5)) # for kstest: self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) # for max_posterior and posterior_step - self._ndim = int(kwargs.pop('ndim', len(sampler.variable_args))) + self._ndim = int(kwargs.pop('ndim', len(sampler.variable_params))) # for min iterations self._min_iterations = int(kwargs.pop('min_iterations', 0)) + def _getniters(self, filename): + """Convenience function to get the number of iterations in the file. + + If `niterations` hasn't been written to the file yet, just returns 0. + """ + with self.sampler.io(filename, 'r') as fp: + try: + niters = fp.niterations + except KeyError: + niters = 0 + return niters + def _getlogposts(self, filename): """Convenience function for retrieving log posteriors. @@ -206,8 +218,7 @@ def _getlogposts(self, filename): def halfchain(self, filename): """Just uses half the chain as the burn-in iteration. """ - with self.sampler.io(filename, 'r') as fp: - niters = fp.niterations + niters = self._getniters(filename) data = self.burn_in_data['halfchain'] # this test cannot determine when something will burn in # only when it was not burned in in the past @@ -218,20 +229,22 @@ def min_iterations(self, filename): """Just checks that the sampler has been run for the minimum number of iterations. """ - with self.sampler.io(filename, 'r') as fp: - niters = fp.niterations + niters = self._getniters(filename) data = self.burn_in_data['min_iterations'] data['is_burned_in'] = niters >= self._min_iterations data['burn_in_iteration'] = self._min_iterations + def max_posterior(self, filename): """Applies max posterior test to self.""" logposts = self._getlogposts(filename) - burn_in_idx, is_burned_in = burn_in.max_posterior( - logposts, self._ndim) + burn_in_idx, is_burned_in = max_posterior(logposts, self._ndim) data = self.burn_in_data['max_posterior'] # required things to store data['is_burned_in'] = is_burned_in.all() - data['burn_in_iteration'] = burn_in_idx.max() + if data['is_burned_in']: + data['burn_in_iteration'] = burn_in_idx.max() + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER # additional info data['iteration_per_walker'] = burn_in_idx data['status_per_walker'] = is_burned_in @@ -261,8 +274,7 @@ def nacl(self, filename): 3. If ``nacls`` times the ACL is < the number of iterations / 2, the chain is considered to be burned in at the half-way point. """ - with self.sampler.io(filename, 'r') as fp: - niters = fp.niterations + niters = self._getniters(filename) kstart = int(niters / 2.) acls = sampler.compute_acls(filename, start_index=kstart) is_burned_in = {param: (self._nacls * acl) < kstart @@ -305,7 +317,7 @@ def ks_test(self, filename): def evaluate(self, filename): """Runs all of the burn-in tests.""" - for tst in self.tests_to_do: + for tst in self.do_tests: getattr(self, tst)(filename) # The iteration to use for burn-in depends on the logic in the burn-in # test string. For example, if the test was 'max_posterior | nacl' and @@ -319,12 +331,13 @@ def evaluate(self, filename): # by that point. Then evaluate the burn-in string at that point to see # if it passes, and if so, what the iteration is. The first point that # the test passes is used as the burn-in iteration. 
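The selection logic described in the comment above, as a self-contained sketch
(the test names are real, the iterations are illustrative):

    data = {'max_posterior': {'is_burned_in': True, 'burn_in_iteration': 5000},
            'nacl': {'is_burned_in': True, 'burn_in_iteration': 6000}}
    burn_in_test = 'max_posterior & nacl'
    for ii in sorted(d['burn_in_iteration'] for d in data.values()):
        results = {t: d['is_burned_in'] and d['burn_in_iteration'] <= ii
                   for (t, d) in data.items()}
        if eval(burn_in_test, {"__builtins__": None}, results):
            burn_in_iteration = ii
            break
    # '&' stops at 6000 here; 'max_posterior | nacl' would stop at 5000
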
- burn_in_iters = numpy.unique([self.data[t]['burn_in_iteration'] + data = self.burn_in_data + burn_in_iters = numpy.unique([data[t]['burn_in_iteration'] for t in self.do_tests]) burn_in_iters.sort() for ii in burn_in_iters: - test_results = {t: (self.data[t]['is_burned_in'] & - self.data[t]['burn_in_iteration'] <= ii) + test_results = {t: (data[t]['is_burned_in'] & + 0 <= data[t]['burn_in_iteration'] <= ii) for t in self.do_tests} is_burned_in = eval(self.burn_in_test, {"__builtins__": None}, test_results) diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index d9a401a..b306260 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -24,10 +24,12 @@ """Provides I/O that is specific to MCMC samplers. """ +from __future__ import absolute_import + from abc import (ABCMeta, abstractmethod) import numpy - +from .base_hdf import write_kwargs_to_hdf_attrs class MCMCIO(object): """Abstract base class that provides some IO functions for ensemble MCMCs. @@ -207,8 +209,9 @@ def write_acls(self, acls): # write the maximum over all params acl = numpy.array(acls.values()).max() self[self.sampler_group].attrs['acl'] = acl - # set the default thin interval to be the acl - self.attrs['thin_interval'] = acl + # set the default thin interval to be the acl (if it is finite) + if numpy.isfinite(acl): + self.attrs['thin_interval'] = acl def read_acls(self): """Reads the acls of all the parameters. diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index efaa16f..71f479a 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -365,15 +365,20 @@ def set_burn_in(self, burn_in): def effective_nsamples(self): """The effective number of samples post burn-in that the sampler has acquired so far.""" - if self.acls is None: - acl = numpy.inf - else: + try: acl = numpy.array(self.acls.values()).max() + except (AttributeError, TypeError): + acl = numpy.inf if self.burn_in is None: niters = self.niterations + elif not self.burn_in.is_burned_in: + nperwalker = 0 else: - niters = self.niterations - self.burn_in.burn_in_iteration - return self.nwalkers * int(niters // acl) + nperwalker = int( + (self.niterations - self.burn_in.burn_in_iteration) // acl) + # after burn in, we always have atleast 1 sample per walker + nperwalker = max(nperwalker, 1) + return self.nwalkers * nperwalker @abstractmethod def run_mcmc(self, niterations): @@ -388,20 +393,27 @@ def write_results(self, filename): def checkpoint(self): """Dumps current samples to the checkpoint file.""" # write new samples - logging.info("Writing samples to file") - self.write_results(self.checkpoint_file) - logging.info("Writing to backup file") - self.write_results(self.backup_file) + logging.info("Writing samples to files") + for fn in [self.checkpoint_file, self.backup_file]: + self.write_results(fn) + with self.io(fn, "a") as fp: + # write the current number of iterations + fp.write_niterations(self.niterations) # check for burn in, compute the acls self.acls = None if self.burn_in is not None: logging.info("Updating burn in") self.burn_in.evaluate(self.checkpoint_file) + burn_in_iter = self.burn_in.burn_in_iteration + logging.info("Is burned in: {}".format(self.burn_in.is_burned_in)) + else: + burn_in_iter = 0 # Compute acls; the burn_in test may have calculated an acl and saved # it, in which case we don't need to do it again. 
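As a quick numerical illustration of the effective_nsamples bookkeeping revised above (all numbers are invented), the property thins the post-burn-in chain by the worst-case ACL over parameters and guarantees at least one sample per walker once burn in has been reached:

    import numpy

    niterations, burn_in_iteration = 5000, 2000
    acls = {'mass1': 8.0, 'mass2': 10.0}   # hypothetical per-parameter ACLs
    nwalkers, is_burned_in = 200, True

    acl = numpy.array(list(acls.values())).max()   # thin by the largest ACL
    if is_burned_in:
        # post burn in: thinned samples per walker, but never fewer than 1
        nperwalker = max(int((niterations - burn_in_iteration) // acl), 1)
    else:
        nperwalker = 0
    print(nwalkers * nperwalker)   # 200 walkers * 300 samples each = 60000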
if self.acls is None: logging.info("Computing acls") - self.acls = self.compute_acl(self.checkpoint_file) + self.acls = self.compute_acl(self.checkpoint_file, + start_index=burn_in_iter) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: @@ -409,8 +421,7 @@ def checkpoint(self): fp.write_burn_in(self.burn_in) if self.acls is not None: fp.write_acls(self.acls) - # write the current number of iterations - fp.write_niterations(self.niterations) + # write effective number of samples fp.write_effective_nsamples(self.effective_nsamples) # check validity logging.info("Validating checkpoint and backup files") @@ -537,8 +548,14 @@ def compute_acl(cls, filename, start_index=None, end_index=None): param, thin_start=start_index, thin_interval=1, thin_end=end_index, flatten=False)[param] samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size + # if < 10 samples, just set to inf + # Note: this should be done inside of pycbc's autocorrelation + # function + if samples.size < 10: + acl = numpy.inf + else: + acl = autocorrelation.calculate_acl(samples) + if acl <= 0: + acl = numpy.inf acls[param] = acl return acls From 7f0952ec919a8c181752405bf4e0ebdfeaa6f6da Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:34:52 +0200 Subject: [PATCH 41/47] fix bugs in nacl burn in test --- gwin/burn_in.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index ac7ad43..40244c8 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -276,7 +276,7 @@ def nacl(self, filename): """ niters = self._getniters(filename) kstart = int(niters / 2.) - acls = sampler.compute_acls(filename, start_index=kstart) + acls = self.sampler.compute_acl(filename, start_index=kstart) is_burned_in = {param: (self._nacls * acl) < kstart for (param, acl) in acls.items()} data = self.burn_in_data['nacl'] @@ -289,7 +289,7 @@ def nacl(self, filename): # additional information data['status_per_parameter'] = is_burned_in # since we calculated it, save the acls to the sampler - sampler.acls = acls + self.sampler.acls = acls def ks_test(self, filename): """Applies ks burn-in test.""" From 67e188cdbebf1ebfe87813f60a0688307243128e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:36:14 +0200 Subject: [PATCH 42/47] write more information to the logging messages --- gwin/sampler/base_mcmc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index 71f479a..dbc1ff8 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -406,6 +406,9 @@ def checkpoint(self): self.burn_in.evaluate(self.checkpoint_file) burn_in_iter = self.burn_in.burn_in_iteration logging.info("Is burned in: {}".format(self.burn_in.is_burned_in)) + if self.burn_in.is_burned_in: + logging.info("Burn-in iteration: {}".format( + self.burn_in.burn_in_iteration)) else: burn_in_iter = 0 # Compute acls; the burn_in test may have calculated an acl and saved @@ -414,6 +417,7 @@ def checkpoint(self): logging.info("Computing acls") self.acls = self.compute_acl(self.checkpoint_file, start_index=burn_in_iter) + logging.info("ACL: {}".format(numpy.array(self.acls.values()).max())) # write for fn in [self.checkpoint_file, self.backup_file]: with self.io(fn, "a") as fp: From a73008b0d35ae83b24049297766abc22ba439c9f Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 14:48:49 +0200 Subject: [PATCH 43/47] fix bugs in 
min_iterations burn-in test --- gwin/burn_in.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index 40244c8..f29e76e 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -74,7 +74,7 @@ def ks_test(samples1, samples2, threshold=0.9): for param in samples1: s1 = samples1[param] s2 = samples2[param] - _, p_value = ks_2samp(samples_last_iter, samples_chain_midpt) + _, p_value = ks_2samp(s1, s2) is_the_same[param] = p_value > threshold return is_the_same @@ -231,8 +231,11 @@ def min_iterations(self, filename): """ niters = self._getniters(filename) data = self.burn_in_data['min_iterations'] - data['is_burned_in'] = niters >= self._min_iterations - data['burn_in_iteration'] = self._min_iterations + data['is_burned_in'] = self._min_iterations < niters + if data['is_burned_in']: + data['burn_in_iteration'] = self._min_iterations + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER def max_posterior(self, filename): """Applies max posterior test to self.""" @@ -304,7 +307,7 @@ def ks_test(self, filename): # do the test # is_the_same is a dictionary of params --> bool indicating whether or # not the 1D marginal is the same at the half way point - is_the_same = ks_test(samples1, samples2, threshold=self.ks_threshold) + is_the_same = ks_test(samples1, samples2, threshold=self._ksthreshold) data = self.burn_in_data['ks_test'] # required things to store data['is_burned_in'] = all(is_the_same.values()) From f6e1d5b15620a38ae2078d65bcb3c7189ae75bb4 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 15:04:25 +0200 Subject: [PATCH 44/47] fix more bugs --- gwin/sampler/base_mcmc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index dbc1ff8..a87fa1e 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -370,14 +370,14 @@ def effective_nsamples(self): except (AttributeError, TypeError): acl = numpy.inf if self.burn_in is None: - niters = self.niterations - elif not self.burn_in.is_burned_in: - nperwalker = 0 - else: + nperwalker = max(int(self.niterations // acl), 1) + elif self.burn_in.is_burned_in: nperwalker = int( (self.niterations - self.burn_in.burn_in_iteration) // acl) # after burn in, we always have atleast 1 sample per walker nperwalker = max(nperwalker, 1) + else: + nperwalker = 0 return self.nwalkers * nperwalker @abstractmethod From a257aed9971e24e385a2041edd2d1eea047b5012 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 15:12:26 +0200 Subject: [PATCH 45/47] fix pep8 issues --- gwin/burn_in.py | 2 +- gwin/io/__init__.py | 1 + gwin/io/base_hdf.py | 2 +- gwin/io/base_mcmc.py | 2 +- gwin/sampler/__init__.py | 11 ++++++----- gwin/sampler/base.py | 3 ++- gwin/sampler/base_mcmc.py | 5 ++--- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index f29e76e..d87bf69 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -196,7 +196,7 @@ def _getniters(self, filename): def _getlogposts(self, filename): """Convenience function for retrieving log posteriors. - + Parameters ---------- filename : str diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index ea519eb..c284bf6 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -31,6 +31,7 @@ EmceeFile.name: EmceeFile, } + def loadfile(path, mode=None, filetype=None, **kwargs): """Loads the given file using the appropriate InferenceFile class. 
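For reference, the ks_2samp call corrected above compares the 1D marginal distribution of each parameter at the chain mid-point against the marginal at the end of the chain; the chain is considered burned in at the mid-point only if every parameter looks the same at both points. A standalone sketch with invented samples (scipy's two-sample KS test returns a statistic and a p-value):

    import numpy
    from scipy.stats import ks_2samp

    # stand-ins for samples read at the chain mid-point and at the last iteration
    samples_midpt = {'mass1': numpy.random.normal(30., 1., size=5000)}
    samples_end = {'mass1': numpy.random.normal(30., 1., size=5000)}
    threshold = 0.9   # same default as the ks_threshold kwarg

    is_the_same = {}
    for param in samples_midpt:
        _, p_value = ks_2samp(samples_midpt[param], samples_end[param])
        is_the_same[param] = p_value > threshold
    # burned in at the mid-point only if all parameters pass
    print(all(is_the_same.values()))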
diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index f52bf02..1b3961a 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -459,7 +459,7 @@ def write_command_line(self): @abstractmethod def write_resume_point(self): """Should write the point that a sampler starts up. - + How the resume point is indexed is up to the sampler. For example, MCMC samplers use the number of iterations that are stored in the checkpoint file. diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index b306260..7e1c3b4 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -31,6 +31,7 @@ import numpy from .base_hdf import write_kwargs_to_hdf_attrs + class MCMCIO(object): """Abstract base class that provides some IO functions for ensemble MCMCs. """ @@ -176,7 +177,6 @@ def write_sampler_metadata(self, sampler): self[self.sampler_group].attrs['nwalkers'] = sampler.nwalkers # write the model's metadata sampler.model.write_metadata(self) - def write_acls(self, acls): """Writes the given autocorrelation lengths. diff --git a/gwin/sampler/__init__.py b/gwin/sampler/__init__.py index 5b6e435..aa7cf3a 100644 --- a/gwin/sampler/__init__.py +++ b/gwin/sampler/__init__.py @@ -21,15 +21,16 @@ from .base import (initial_dist_from_config, create_new_output_file) # from .kombine import KombineSampler -from .emcee import (EmceeEnsembleSampler, ) # EmceePTSampler) +from .emcee import EmceeEnsembleSampler +# from .emcee_pt import EmceePTSampler # from .mcmc import MCMCSampler # list of available samplers samplers = {cls.name: cls for cls in ( - #KombineSampler, + # KombineSampler, EmceeEnsembleSampler, - #EmceePTSampler, - #MCMCSampler, + # EmceePTSampler, + # MCMCSampler, )} @@ -48,7 +49,7 @@ def load_from_config(cp, model, **kwargs): \**kwargs : All other keyword arguments are passed directly to the sampler's ``from_config`` file. - + Returns ------- sampler : diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 86971f0..41bc2b0 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -44,6 +44,7 @@ # ============================================================================= # + class BaseSampler(object): """Base container class for inference samplers. 
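The samplers dict and load_from_config above suggest a simple name-based dispatch: read which sampler the config file asks for, then hand everything to that class's from_config. The body of load_from_config is not shown in this hunk, so the section and option names below are assumptions, and DummySampler is invented purely to make the sketch self-contained:

    try:
        from ConfigParser import ConfigParser   # python 2
    except ImportError:
        from configparser import ConfigParser   # python 3

    class DummySampler(object):
        name = 'dummy'

        @classmethod
        def from_config(cls, cp, model, **kwargs):
            return cls()

    registry = {cls.name: cls for cls in (DummySampler,)}

    def load_from_config(cp, model, **kwargs):
        # read which sampler to use from a [sampler] section, then delegate
        name = cp.get('sampler', 'name')
        return registry[name].from_config(cp, model, **kwargs)

    cp = ConfigParser()
    cp.add_section('sampler')
    cp.set('sampler', 'name', 'dummy')
    sampler = load_from_config(cp, model=None)

Keeping the registry as a plain dict means a new sampler only has to define a unique name attribute and add itself to the tuple to become loadable from a config file.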
@@ -171,7 +172,7 @@ def setup_output(self, output_file, force=False, injection_file=None): backup_file) # Create a new file if the checkpoint doesn't exist, or if it is # corrupted - self.new_checkpoint = False # keeps track if this is a new file or not + self.new_checkpoint = False # keeps track if this is a new file or not if not checkpoint_valid: logging.info("Checkpoint not found or not valid") create_new_output_file(self, checkpoint_file, force=force, diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index a87fa1e..d5afbc0 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -134,8 +134,8 @@ class BaseMCMC(object): """ __metaclass__ = ABCMeta - _lastclear = None # the iteration when samples were cleared from memory - _itercounter = None # the number of iterations since the last clear + _lastclear = None # the iteration when samples were cleared from memory + _itercounter = None # the number of iterations since the last clear _pos = None _p0 = None _nwalkers = None @@ -340,7 +340,6 @@ def run(self): # run the underlying sampler for the desired interval self.run_mcmc(iterinterval) # update the itercounter - #startiter = startiter + iterinterval self._itercounter = self._itercounter + iterinterval # dump the current results self.checkpoint() From 0a6f82dc450064ead84fd924420ad42ae57d23e2 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 21:09:14 +0200 Subject: [PATCH 46/47] fix bugs for running with data --- gwin/io/base_hdf.py | 1 + gwin/io/base_mcmc.py | 5 ++++- gwin/models/base.py | 8 +++++++- gwin/models/base_data.py | 8 +++++++- gwin/models/gaussian_noise.py | 18 ++++++++++++++---- 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 1b3961a..8a1665c 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -415,6 +415,7 @@ def write_psd(self, psds, group=None): if group is None: group = subgroup else: + print group, subgroup group = '/'.join([group, subgroup]) for ifo in psds: self[group.format(ifo=ifo)] = psds[ifo] diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 7e1c3b4..f77247f 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -103,7 +103,8 @@ def write_samples(self, samples, parameters=None, istop = istart + niterations self.create_dataset(dataset_name, (nwalkers, istop), maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) + dtype=samples[param].dtype, + fletcher32=True) self[dataset_name][:, istart:istop] = samples[param] def read_raw_samples(self, fields, @@ -122,6 +123,8 @@ def read_raw_samples(self, fields, dict A dictionary of field name -> numpy array pairs. """ + if isinstance(fields, (str, unicode)): + fields = [fields] # walkers to load if walkers is not None: widx = numpy.zeros(fp.nwalkers, dtype=bool) diff --git a/gwin/models/base.py b/gwin/models/base.py index 9c4598c..a75c881 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -745,7 +745,13 @@ def from_config(cls, cp, **kwargs): return cls(**args) def write_metadata(self, fp): - """Writes metadata to the given file handler.""" + """Writes metadata to the given file handler. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. 
+ """ fp.attrs['model'] = self.name fp.attrs['variable_params'] = list(self.variable_params) fp.attrs['sampling_params'] = list(self.sampling_params) diff --git a/gwin/models/base_data.py b/gwin/models/base_data.py index b15327f..ebb5723 100644 --- a/gwin/models/base_data.py +++ b/gwin/models/base_data.py @@ -238,6 +238,12 @@ def from_config(cls, cp, data, delta_f=None, delta_t=None, return cls(**args) def write_metadata(self, fp): - """Adds data to the metadata that's written.""" + """Adds data to the metadata that's written. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. + """ super(BaseDataModel, self).write_metadata(fp) fp.write_stilde(self.data) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index 645dbb1..81dfb4e 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -441,12 +441,22 @@ def write_metadata(self, fp): """Adds writing the psds and lognl, since it's a constant. The lognl is written to the sample group's ``attrs``. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. """ - super(GaussianNoise, self).write_data(fp) - self.attrs['f_lower'] = self._f_lower + super(GaussianNoise, self).write_metadata(fp) + fp.attrs['f_lower'] = self._f_lower if self._psds is not None: - fp.write_psd(self, self._psds) - attrs = fp[fp.samples_group].attrs + fp.write_psd(self._psds) + try: + attrs = fp[fp.samples_group].attrs + except KeyError: + # group doesn't exist, create it + fp.create_group(fp.samples_group) + attrs = fp[fp.samples_group].attrs attrs['lognl'] = self.lognl for det in self.detectors: attrs['{}_lognl'.format(det)] = self.det_lognl(det) From 370613ee36e6c55515da98f63fa85f9fb075834e Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Sat, 4 Aug 2018 21:11:52 +0200 Subject: [PATCH 47/47] whitespace --- gwin/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gwin/models/base.py b/gwin/models/base.py index a75c881..e15dc57 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -746,7 +746,7 @@ def from_config(cls, cp, **kwargs): def write_metadata(self, fp): """Writes metadata to the given file handler. - + Parameters ---------- fp : gwin.io.BaseInferenceFile instance