smssample.m

% The core function for the SMS sampler.
%
% inputs
%   model: the generative model that defines the posterior of interest.
%   A struct with at least the following required fields:
%   - id (string): a unique identifier.
%   - dim: the dimensionality of the parameters (theta).
%   - param: the hyper-parameters of the model (e.g., parameters in the
%     prior and likelihood).
%   - (all the following fields are function handles)
%   - prior: the prior p(theta).
%   - lklhd: the per-datum likelihood p(x|theta).
%   There are also several optional fields that the code takes advantage
%   of when they are available.
%   - data2stats & statlklhd: the subset likelihood is by default computed
%     by multiplying all the per-datum likelihoods. But this can be very
%     inefficient and can be accelerated when e.g. (sufficient) statistics
%     are available for the model. In such cases, we use 'data2stats' to
%     extract statistics and then call 'statlklhd' for subset likelihood
%     evaluation. The output of 'data2stats' can be fairly arbitrary as
%     long as 'statlklhd' understands it.
%   - log- & gradlog-: log- and gradient-of-log- of the operand (e.g.
%     'prior', 'lklhd', 'statlklhd', etc.)
%   - statfun: given samples of the parameter and the 'data', outputs
%     fixed-dimensional statistics (of the samples) for error measuring
%     (see approxfam.errmsr below).
%   See e.g. bayeslogreg, gaussian.
%
%   data: the observed data that is assumed to be generated by the 'model'.
%   A struct array holding relevant fields of the data ('y', 'x', etc.)
%   It is important that the 'model' understands these fields so that e.g.
%   model.lklhd(data(i), theta) is a valid function call.
%   See e.g. bayeslogreg_gendata.
%
%   di: the partition scheme of the dataset.
%   A cell array each entry of which holds an array of indices into the
%   'data' so that 'di' represents a specific partition of the data points.
%   E.g. when 'data' is of length 6, 'di' can be {[1,3],[2],[4,5,6]}.
%
%   approxfam: the approximating family.
%   A struct with at least the following required fields:
%   - id (string): a unique identifier.
%   - (all the following fields are function handles)
%   - validparam: check whether the input 'param' is valid for the family.
%     During SMS iterations, this can also be used as a signal to invoke
%     aggressive damping when e.g. the "negative variance" (an invalid
%     param) issue occurs.
%     Also note that 'approxfam' does not necessarily need to work with
%     "natural parameters". The 'param' is valid as long as the following
%     functions understand it in a consistent way.
%   - mul: multiply the two input distributions.
%   - div: divide the 1st input distribution with the 2nd.
%   - mulall: multiply all the input distributions.
%   - damp: perform damping. 'dampalpha'==0 means no damping. 'dampalpha'
%     can be other than scalar (e.g. dampalpha.a = 0.5, dampalpha.b = 0.3
%     if param-specific damping is desired) as long as the function
%     understands it.
%   - emptyparam: output 'param' s.t. mul(param, nparam) = nparam.
%   - priorparam: output 'param' that matches the prior of the model 
%     (raised to the power of '1/m') and a boolean scalar 'exact' to imply
%     whether the prior belongs to the current approximating family. Note
%     that the output 'param' is merely used for initialization purposes
%     and hence does not necessarily need to be "exact".
%   - diverge: compute the divergence between the two input distributions.
%   - smpls2param: compute 'param' from the samples via "moment matching".
%     A default 'param' can be passed through the 2nd argument if
%     necessary.
%   - param2smplm: compute sample mean from the 'param'.
%   - errmsr: measure the errors of the samples & 'param' with the ground
%     truth samples.
%   See e.g. approxfam_gaussian.
%
%   async: a boolean scalar, true for "asynchronous" / false for
%   "synchronous".
%
%   partprior: a boolean scalar, true to "partition the prior" / false to
%   "retain the full prior as an individual factor".
%
%   dampalpha: a double scalar, 0 means "no damping" (use the new value);
%   1 means "complete damping" (use the old value). It can be other than
%   scalar if necessary (see comments above for 'approxfam').
%
%   sampler: the local sampler.
%   A struct with a function handle field 'draw' that outputs samples given
%   the following inputs:
%   - approxfam: the 'approxfam'.
%   - qxparam: the 'param' of the context prior.
%   - factor: a struct representing the local factor.
%     'factor.data' stores the local subset or, when 'model.data2stats'
%     is properly defined, the necessary statistics of the data that can be
%     useful for the sampler.
%     'factor.f' evaluates the factor function at the input sample.
%     Optionally, 'factor.logf' & 'factor.gradlogf' might also be available
%     when 'model' has its corresponding 'log-' & 'gradlog-' fields
%     properly defined.
%   - nsamples: number of samples to be drawn.
%   - smpl0: the initial sample (a row vector).
%   Note that how the 'approxfam' interprets the 'qxparam' to give out
%   necessary information needed for sampling is totally up to the user.
%   The returned samples shall be stored in rows.
%   See e.g. sampler_nuts_da.
%
%   tsmpls: the ground truth samples.
%   A matrix where samples are stored in rows. It is used only for
%   monitoring errors during the iterations and does not affect the
%   execution logic of the algorithm.
%
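%   T (optional): an integer scalar that controls the number of effective
%   samples to draw for each local sampling step. When omitted, a node
%   holding n data points draws max(ceil(1000*d/sqrt(n)), 10000) samples,
%   where d is the dimensionality of the parameters.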
%
%   maxiter (optional): maximum number of iterations (10 by default).
%
% outputs
%   qparam: the parameters of the approximate posterior at termination.
%
%   smplcell: a (1 x m) cell array each entry of which holds a matrix of
%   the samples (stored in rows) from the nodes at termination.
%
%   divs: an array recording divergences between the two approximate
%   posteriors from successive iterations.
%
%   errs: a struct array holding the measured errors during iterations.
%
%   lsm: a 3D array of size d x (m+1) x niters, holding local sample means
%   during iterations. The "+1" dimension corresponds to the mean of the
%   approximate posterior.
%
% Minjie Xu (chokkyvista06@gmail.com)

function [qparam, smplcell, divs, errs, lsm] = smssample(model, data, di, ...
    approxfam, async, partprior, dampalpha, sampler, tsmpls, T, maxiter)
% SMSSAMPLE  Main driver of the SMS sampler (arguments: see the file header).
%
% Outline of what the code below does:
%   1. Wrap every data subset in 'di' (and, when ~partprior, the prior
%      itself) as a local factor.
%   2. Initialize the local approximations (q0param, qiparams), their
%      product (qparam) and the context priors (qx0param, qxiparams).
%   3. Repeatedly run 'epiter' on the nodes, either all at once inside a
%      'parfor' (synchronous) or as independent tasks/futures whose results
%      are merged as they arrive (asynchronous).
%   4. After every pass through the main loop record an error entry in
%      'errs'; after every m node updates record the divergence between the
%      previous and the current global approximation in 'divs'.
%
% The loop stops after 'ncnvg' successive rounds whose divergence lies in
% [0, epsilon), or once maxiter*m node updates have been performed.
%
% 'T' and 'maxiter' may be omitted or passed as [] to request the defaults.

%% configurations
d = model.dim;
m = numel(di);

% nsmpls(n): number of samples to draw at a node that holds n data points.
% An empty T is treated like an omitted one so that callers can write
% smssample(..., [], maxiter).
if ~exist('T', 'var') || isempty(T)
    nsmpls = @(n)max(ceil(1000*d/sqrt(n)), 10000);
else
    nsmpls = @(n)T;
end
if ~exist('maxiter', 'var') || isempty(maxiter)
    maxiter = 10;
end

epsilon = 0.5*d; % threshold for convergence control
ncnvg = 5; % number of successive "converging" iterations to wait before termination

%% reformulate as factors
if partprior
    % pprior holds part of the prior that is assigned to each subdata node
    pprior = genpprior_part(model, m);
else
    % when the prior is retained wholly in its own node, pprior becomes a
    % dummy placeholder
    pprior = genpprior_whole(d);
    % define the factor that is associated with the prior
    pfactor = struct('f', model.prior, 'data', []);
    if isfield(model, 'logprior')
        pfactor.logf = model.logprior;
    end
    if isfield(model, 'gradlogprior')
        pfactor.gradlogf = model.gradlogprior;
    end
end

% one factor per entry of 'di'; the factor carries either the raw subset or
% the statistics extracted from it by model.data2stats
subdata = cellfun(@(ii)data(ii), di, 'UniformOutput', false);
if isfield(model, 'data2stats')
    ss = cellfun(@(sdi)model.data2stats(sdi), subdata, 'UniformOutput', false);
    factors = cellfun(@(ssi)genfactor_statlklhd(model, pprior, ssi), ss);
    clear ss;
else
    factors = cellfun(@(sdi)genfactor_lklhd(model, pprior, sdi), subdata);
end
clear subdata;

%% Initialize messages
% initialize params of the local approximating factors
% q0param: the factor associated with the prior
% qiparams: the m factors associated with the data subsets
if partprior
    q0param = approxfam.emptyparam(model);
    qiparams = repmat(approxfam.priorparam(model, m), 1, m);
else
    % exactpf is only defined on this branch; every later use is guarded by
    % '~partprior &&' so it is never read when partprior is true
    [q0param, exactpf] = approxfam.priorparam(model, 1);
    qiparams = repmat(approxfam.emptyparam(model), 1, m);
end
assert(approxfam.validparam(q0param));
assert(all(arrayfun(@(param)approxfam.validparam(param), qiparams)));
% compute params of the global posterior
qparam = approxfam.mulall([q0param, qiparams]);
% compute params of the context priors
qx0param = approxfam.div(qparam, q0param);
qxiparams = arrayfun(@(qiparam)approxfam.div(qparam, qiparam), qiparams, 'UniformOutput', false);
qxiparams = cell2mat(qxiparams);

%% EP iterations
fprintf('--- m = %d ---\n', m);
disp('setting up cluster');
% set up the parallel cluster
nsamples = arrayfun(@(dii)nsmpls(numel(dii)), di);
if async % asynchronous
    if verLessThan('distcomp', '6.3')
        % legacy API: one task per node, all created in a single job
        jobs = [];
        schd = findResource();
        job = createJob(schd, 'FileDependencies', {mfilename}); % append the cell array when necessary
        jobs = [jobs, job];
        for i = 1:m
            F(i) = createTask(job, @epiter, nargout(@epiter), ...
                {approxfam, qxiparams(i), qiparams(i), factors(i), ...
                sampler, nsamples(i), dampalpha});
        end
        submit(job);
    else
        % newer API: one future per node
        for i = 1:m
            F(i) = parfeval(@epiter, nargout(@epiter), ...
                approxfam, qxiparams(i), qiparams(i), factors(i), ...
                sampler, nsamples(i), dampalpha);
        end
    end
    jids = zeros(1, m); % every m finished jobs' (nodes') IDs
else % synchronous
    if ~verLessThan('distcomp', '6.4')
        % In newer versions, Matlab will spontaneously start a new parallel
        % cluster (with default profile) for 'parfor' if necessary.
    else
        schd = findResource();
        jobs = findJob(schd);
        if matlabpool('size') == 0 || isempty(jobs)
            matlabpool(schd, 'FileDependencies', {mfilename});
        else
            % a pool is already open: make sure this file is among its
            % file dependencies
            nfldpnd = {mfilename};
            fldpnd = get(jobs(1), 'FileDependencies');
            for i = 1:numel(nfldpnd)
                if isempty(cell2mat(strfind(fldpnd, nfldpnd{i})))
                    matlabpool('addfiledependencies', nfldpnd(i));
                end
            end
            matlabpool('updatefiledependencies');
        end
    end
end

% initialize the outputs
dummysample = approxfam.param2smplm(qparam);
smplcell = repmat({dummysample}, 1, m);
divs = nan*ones(1, maxiter);
errs = [];
if nargout > 4
    lsm = zeros(d, m+1, maxiter);
end

tcnvg = 0; % number of successive rounds that satisfied the convergence test
bqparam = qparam; % global approximation at the end of the previous round

if isfield(model, 'statfun')
    tstats = model.statfun(tsmpls, data);
end

iter = 0; % the current total number of iterations (aggregated from all nodes)
liter = m; % value of 'iter' at which the next round is considered complete
% number of completed rounds; initialized here so that the trimming after
% the loop is well defined even when the loop body never runs (maxiter <= 0)
niters = 0;
tStart = tic;
% the main loop
disp('entering main loop');
while tcnvg < ncnvg && iter < maxiter*m
    if async % asynchronous
        if verLessThan('distcomp', '6.3')
            % NOTE: despite its name, 'pending' flags the tasks whose state
            % is 'finished', i.e. whose outputs are waiting to be fetched
            pending = strcmp(get(F, 'State'), 'finished');
            while ~any(pending) % all jobs are running
                pause(0.2);
                pending = strcmp(get(F, 'State'), 'finished');
            end
            for i = 1:m
                if pending(i) % finished but output yet to be fetched
                    outputs = get(F(i), 'OutputArguments');
                    [qlparam, smpls] = deal(outputs{:});
                    jids(mod(iter,m)+1) = i;
                    % compute the latest context for node i
                    qxparam = approxfam.div(qparam, qiparams(i));
                    % compute qparam with the latest update from node i
                    qparam = approxfam.mul(qxparam, qlparam);
                    qiparams(i) = qlparam;
                    smplcell{i} = smpls;
                    iter = iter + 1;
                end
            end

            nssamples = sum(nsamples(pending));
            npsamples = mean(nsamples(pending));

            % resubmit every node that has just been merged, using its
            % refreshed context prior
            job = createJob(schd, 'FileDependencies', {mfilename});
            jobs = [jobs, job];
            for i = 1:m
                if pending(i)
                    [qxiparams(i)] = update_context(approxfam, qparam, qiparams(i), qxiparams(i));
                    destroy(F(i));
                    F(i) = createTask(job, @epiter, nargout(@epiter), ...
                        {approxfam, qxiparams(i), qiparams(i), factors(i), ...
                        sampler, nsamples(i), dampalpha});
                end
            end
            submit(job);
        else
            % merge the next available result (from node ji)
            [ji, qlparam, smpls] = fetchNext(F);
            jids(mod(iter,m)+1) = ji;
            % swap the old local approximation of node ji for the new one
            qxparam = approxfam.div(qparam, qiparams(ji));
            qparam = approxfam.mul(qxparam, qlparam);
            qiparams(ji) = qlparam;
            smplcell{ji} = smpls;
            iter = iter + 1;

            nssamples = nsamples(ji);
            npsamples = nsamples(ji);

            % relaunch node ji with its refreshed context prior
            [qxiparams(ji)] = update_context(approxfam, qparam, qiparams(ji), qxiparams(ji));
            F(ji) = parfeval(@epiter, nargout(@epiter), ...
                approxfam, qxiparams(ji), qiparams(ji), factors(ji), ...
                sampler, nsamples(ji), dampalpha);
        end
    else % synchronous
        % update all the m nodes against the same set of context priors
        parfor i = 1:m
            [qiparams(i), smpls] = epiter(approxfam, qxiparams(i), qiparams(i), factors(i), ...
                sampler, nsamples(i), dampalpha);
            smplcell{i} = smpls;
        end
        iter = iter + m;
        qparam = approxfam.mulall([q0param, qiparams]);

        nssamples = sum(nsamples);
        npsamples = mean(nsamples);

        for i = 1:m
            [qxiparams(i)] = update_context(approxfam, qparam, qiparams(i), qxiparams(i));
        end
    end

    % update the prior factor once every other m factor updates when
    % necessary
    if ~partprior && ~exactpf && iter >= liter
        qx0param = update_context(approxfam, qparam, q0param, qx0param);
        q0param = epiter(approxfam, qx0param, q0param, pfactor, sampler, nsmpls(1), dampalpha);
        qparam = approxfam.mulall([q0param, qiparams]);
        for i = 1:m
            [qxiparams(i)] = update_context(approxfam, qparam, qiparams(i), qxiparams(i));
        end
    end

    % compute the errors
    allsmpls = cell2mat(smplcell');
    err = approxfam.errmsr(tsmpls, allsmpls, qparam);
    if isfield(model, 'statfun')
        err.stats_mse = mean((tstats - model.statfun(allsmpls, data)).^2);
    end
    err.nssamples = nssamples;
    err.npsamples = npsamples;
    errs = [errs, err];

    % every complete round of updates of all the factors
    if iter >= liter
        tElapsed = toc(tStart);

        niters = floor(iter/m);
        liter = (niters+1) * m;
        div = approxfam.diverge(bqparam, qparam);
        divs(niters) = div;
        bqparam = qparam;

        if nargout > 4
            for i = 1:m
                lsm(:, i, niters) = mean(smplcell{i})';
            end
            lsm(:, end, niters) = approxfam.param2smplm(qparam);
        end

        % a round counts as "converging" only for 0 <= div < epsilon; any
        % other value (including NaN) resets the counter
        if div >= 0 && div < epsilon
            tcnvg = tcnvg + 1;
        else
            tcnvg = 0;
        end

        if async
            fprintf('(%s\b)\n', num2str(jids, '%d, '));
        end
        if nargout > 4
            fprintf('mean = (%s)\n', ...
                strjoin(cellstr(num2str(lsm(:, end, niters), '%.3f'))', ', '));
        end
        fprintf('[%d] div = %.3f, t = %.3fs\n\n', niters, div, tElapsed);

        tStart = tic;
    end
end
% drop the slots of the rounds that were never reached
divs(niters+1:end) = [];
if nargout > 4
    lsm(:,:,niters+1:end) = [];
end

% release the tasks/futures that are still alive
if async
    if verLessThan('distcomp', '6.3')
        arrayfun(@(f)destroy(f), F);
        arrayfun(@(j)destroy(j), jobs);
        clear F jobs;
    else
        cancel(F);
    end
end
end


function [qxparam] = update_context(approxfam, qparam, qlparam, oqxparam)
% UPDATE_CONTEXT  Recompute a context prior, damping it when it is invalid.
%
% inputs
%   approxfam: the approximating family ('div', 'damp' and 'validparam'
%   are used).
%   qparam: the 'param' of the global approximation.
%   qlparam: the 'param' of the local approximation to be divided out.
%   oqxparam: the previous context prior, used as the damping target.
%
% outputs
%   qxparam: approxfam.div(qparam, qlparam) when that is valid; otherwise
%   the first valid result of damping it towards 'oqxparam' with
%   alpha = 0.1, 0.2, ..., 1.
%
% An error ('smssample:invalidContext') is raised when no candidate is
% valid, i.e. when even complete damping (alpha = 1) fails.
nqxparam = approxfam.div(qparam, qlparam);
qxparam = nqxparam;
if approxfam.validparam(qxparam)
    return;
end

% progressively stronger damping; talpha(1) == 0 is the undamped candidate
% that has just been rejected, hence the loop starts from the 2nd entry
talpha = linspace(0, 1, 11);
for j = 2:numel(talpha)
    qxparam = approxfam.damp(nqxparam, oqxparam, talpha(j));
    if approxfam.validparam(qxparam)
        return;
    end
end

% the grid is exhausted: fail with a descriptive message rather than with
% an out-of-range index into 'talpha'
error('smssample:invalidContext', ...
    'No valid context prior found: the candidate is still invalid under complete damping (alpha = 1).');
end


function [qlparam, smpls] = epiter(approxfam, qxparam, qlparam, factor, ...
    sampler, nsamples, dampalpha)
% EPITER  Perform one local update of a single node.
%
% Draws 'nsamples' samples with 'sampler.draw' (started from the sample
% mean of the context prior 'qxparam'), converts them to a 'param' via
% approxfam.smpls2param, divides the context prior out of it and damps the
% result against the incoming local approximation 'qlparam'.
%
% outputs
%   qlparam: the damped, updated local approximation.
%   smpls: the samples returned by the sampler.
assert(nsamples >= 1);

% sample, starting from the mean of the context prior
smpls = sampler.draw(approxfam, qxparam, factor, nsamples, ...
    approxfam.param2smplm(qxparam));

% context prior times the current local approximation is handed to
% smpls2param as its default 'param'
fallback = approxfam.mul(qxparam, qlparam);
matched = approxfam.smpls2param(smpls, fallback);

% divide the context out, then damp against the old local approximation
undamped = approxfam.div(matched, qxparam);
qlparam = approxfam.damp(undamped, qlparam, dampalpha);
end


function pprior = genpprior_part(model, m)
% GENPPRIOR_PART  Build the share of the prior that goes to each of m nodes.
%
% outputs
%   pprior: a struct of function handles.
%   - p: the prior raised to the power of 1/m.
%   - logp: the log-prior divided by m (only if model.logprior exists).
%   - gradlogp: the gradient of the log-prior divided by m (only if
%     model.gradlogprior exists).
pprior = struct('p', @(var)model.prior(var).^(1/m));

haslog = isfield(model, 'logprior');
hasgrad = isfield(model, 'gradlogprior');
if haslog
    pprior.logp = @(var)model.logprior(var)./m;
end
if hasgrad
    pprior.gradlogp = @(var)model.gradlogprior(var)./m;
end
end


function pprior = genpprior_whole(d)
% GENPPRIOR_WHOLE  Build a placeholder prior share that contributes nothing.
%
% outputs
%   pprior: a struct of function handles that ignore their input.
%   - p: always returns 1.
%   - logp: always returns 0.
%   - gradlogp: always returns a (d x 1) vector of zeros.
nograd = zeros(d, 1);
pprior.p = @(var)1;
pprior.logp = @(var)0;
pprior.gradlogp = @(var)nograd;
end


function factor = genfactor_statlklhd(model, pprior, ssi)
% GENFACTOR_STATLKLHD  Build a local factor from the statistics of a subset.
%
% inputs
%   model: the generative model; 'statlklhd', 'logstatlklhd' and
%   'gradlogstatlklhd' are used when present.
%   pprior: the share of the prior assigned to the node (fields 'p' and,
%   optionally, 'logp' / 'gradlogp').
%   ssi: the statistics of the local subset, passed untouched to the model.
%
% outputs
%   factor: a struct with
%   - f: statistics-likelihood times the prior share.
%   - logf / gradlogf: the log / gradient-of-log counterparts. Each is
%     defined only when BOTH the model and 'pprior' provide the matching
%     function; otherwise the handle would exist but fail as soon as it is
%     called.
%   - data: 'ssi'.
if isfield(model, 'statlklhd')
    factor.f = @(w)model.statlklhd(ssi, w)*pprior.p(w);
end
if isfield(model, 'logstatlklhd') && isfield(pprior, 'logp')
    factor.logf = @(w)model.logstatlklhd(ssi, w)+pprior.logp(w);
end
if isfield(model, 'gradlogstatlklhd') && isfield(pprior, 'gradlogp')
    factor.gradlogf = @(w)model.gradlogstatlklhd(ssi, w)+pprior.gradlogp(w);
end
factor.data = ssi;
end


function factor = genfactor_lklhd(model, pprior, subdatai)
% GENFACTOR_LKLHD  Build a local factor from the raw data of a subset.
%
% inputs
%   model: the generative model; 'lklhd', 'loglklhd' and 'gradloglklhd'
%   (all per-datum) are used when present.
%   pprior: the share of the prior assigned to the node (fields 'p' and,
%   optionally, 'logp' / 'gradlogp').
%   subdatai: the struct array holding the local subset (any orientation).
%
% outputs
%   factor: a struct with
%   - f: product of the per-datum likelihoods times the prior share.
%   - logf: sum of the per-datum log-likelihoods plus the log prior share.
%   - gradlogf: sum of the per-datum gradients plus the gradient of the
%     log prior share. The per-datum gradients are laid side by side and
%     summed along the 2nd dimension, so they are expected to be column
%     vectors.
%   - data: 'subdatai', untouched.
%   'logf' / 'gradlogf' are defined only when BOTH the model and 'pprior'
%   provide the matching function; otherwise the handle would exist but
%   fail as soon as it is called.

% Iterate over a row-shaped view so that cell2mat below always places the
% per-datum gradients side by side, whatever the orientation of the input.
rowdata = reshape(subdatai, 1, []);

if isfield(model, 'lklhd')
    factor.f = @(w)prod(arrayfun(@(sdii)model.lklhd(sdii,w), rowdata))*pprior.p(w);
end
if isfield(model, 'loglklhd') && isfield(pprior, 'logp')
    factor.logf = @(w)sum(arrayfun(@(sdii)model.loglklhd(sdii,w), rowdata)) ...
        + pprior.logp(w);
end
if isfield(model, 'gradloglklhd') && isfield(pprior, 'gradlogp')
    factor.gradlogf = @(w)sum(cell2mat(arrayfun(...
        @(sdii)model.gradloglklhd(sdii,w), rowdata, 'UniformOutput', false)), 2) ...
        + pprior.gradlogp(w);
end
factor.data = subdatai;
end