Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor: Parameter Distributions #88

Merged
merged 14 commits into from
Dec 7, 2020
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScikitLearn = "3646fa90-6ef7-5e7e-9f22-8aca16db6324"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
1 change: 1 addition & 0 deletions src/CalibrateEmulateSample.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ using Distributions, Statistics, LinearAlgebra, DocStringExtensions
# No internal deps, light external deps
include("Observations.jl")
include("Priors.jl")
include("ParameterDistributions.jl")

# No internal deps, heavy external deps
include("EKP.jl")
Expand Down
321 changes: 321 additions & 0 deletions src/ParameterDistributions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
module ParameterDistributionsStorage

## Imports
using Distributions
using StatsBase
using Random

## Exports

#objects
export Parameterized, Samples
export NoConstraint, BoundedBelow, BoundedAbove, Bounded
export ParameterDistribution, ParameterDistributions

#functions
export get_name, get_distribution
export sample_distribution
export transform_real_to_prior, transform_prior_to_real

## Objects
# for the Distribution
abstract type ParameterDistributionType end

"""
Parameterized <: ParameterDistributionType

A distribution constructed from a parametrized formula (e.g Julia Distributions.jl)
"""
struct Parameterized <: ParameterDistributionType
distribution::Distribution
end

"""
Samples{FT<:Real} <: ParameterDistributionType

A distribution comprised of only samples, stored as columns of parameters
"""
struct Samples{FT<:Real} <: ParameterDistributionType
distribution_samples::Array{FT,2} #parameters are columns
Samples(distribution_samples::Array{FT,2}; params_are_columns=true) where {FT <: Real} = params_are_columns ? new{FT}(distribution_samples) : new{FT}(permutedims(distribution_samples,[2,1]))
#Distinguish 1 sample of an ND parameter or N samples of 1D parameter, and store as 2D array
Samples(distribution_samples::Array{FT,1}; params_are_columns=true) where {FT <: Real} = params_are_columns ? new{FT}(reshape(distribution_samples,1,:)) : new{FT}(reshape(distribution_samples,:,1))

end


# For the transforms
abstract type ConstraintType end

"""
NoConstraint <: ConstraintType

No constraint.
"""

struct NoConstraint <: ConstraintType end

"""
BoundedBelow{FT <: Real} <: ConstraintType

A lower bound constraint.
"""
struct BoundedBelow{FT <: Real} <: ConstraintType
lower_bound::FT
end
"""
BoundedAbove{FT <: Real} <: ConstraintType

And upper bound constraint
"""
struct BoundedAbove{FT <: Real} <: ConstraintType
upper_bound::FT
end

"""
Bounded{FT <: Real} <: ConstraintType

Both a lower and upper bound constraint.
"""
struct Bounded{FT <: Real} <: ConstraintType
lower_bound::FT
upper_bound::FT
Bounded(lower_bound::FT,upper_bound::FT) where {FT <: Real} = upper_bound <= lower_bound ? throw(DomainError("upper bound must be greater than lower bound")) : new{FT}(lower_bound, upper_bound)
end

"""
len(c::Array{CType})

The number of constraints, each constraint has length 1.
"""
function len(c::CType) where {CType <: ConstraintType}
return 1
end

function len(carray::Array{CType}) where {CType <: ConstraintType}
return size(carray)[1]
end

"""
dimension(d::ParametrizedDistributionType)

The number of dimensions of the parameter space
"""
function dimension(d::Parameterized)
return length(d.distribution)
end

function dimension(d::Samples)
return size(d.distribution_samples)[1]
end

"""
n_samples(d::Samples)

The number of samples in the array
"""
function n_samples(d::Samples)
return size(d.distribution_samples)[2]
end

"""
struct ParameterDistribution

Structure to hold a parameter distribution
"""
struct ParameterDistribution{PDType <: ParameterDistributionType, CType <: ConstraintType}
distribution::PDType
constraints::Array{CType}
name::String

ParameterDistribution(parameter_distribution::PDType,
constraints::Array{CType},
name::String) where {PDType <: ParameterDistributionType,
CType <: ConstraintType} = !(dimension(parameter_distribution) ==
len(constraints)) ? throw(DimensionMismatch("There must be one constraint per parameter")) : new{PDType,CType}(parameter_distribution, constraints, name)
end


"""
struct ParameterDistributions

Structure to hold an array of ParameterDistribution's
"""
struct ParameterDistributions
parameter_distributions::Array{ParameterDistribution}
end

odunbar marked this conversation as resolved.
Show resolved Hide resolved

## Functions

"""
get_name(pds::ParameterDistributions)

Returns a list of ParameterDistribution names
"""
function get_name(pds::ParameterDistributions)
return [get_name(pd) for pd in pds.parameter_distributions]
end

function get_name(pd::ParameterDistribution)
return pd.name
end

"""
get_distributions(pds::ParameterDistributions)

Returns a `Dict` of `ParameterDistribution` distributions by name, (unless sample type)
"""
function get_distribution(pds::ParameterDistributions)
return Dict{String,Any}(get_name(pd) => get_distribution(pd) for pd in pds.parameter_distributions)
end

function get_distribution(pd::ParameterDistribution)
return get_distribution(pd.distribution)
end

function get_distribution(d::Samples)
return "Contains samples only"
end
function get_distribution(d::Parameterized)
return d.distribution
end

"""
function sample_distribution(pds::ParameterDistributions)

Draws samples from the parameter distributions
"""
function sample_distribution(pds::ParameterDistributions)
return sample_distribution(pds,1)
end

function sample_distribution(pds::ParameterDistributions, n_draws::IT) where {IT <: Integer}
return cat([sample_distribution(pd,n_draws) for pd in pds.parameter_distributions]...,dims=1)
end

function sample_distribution(pd::ParameterDistribution)
return sample_distribution(pd,1)
end

function sample_distribution(pd::ParameterDistribution, n_draws::IT) where {IT <: Integer}
return sample_distribution(pd.distribution,n_draws)
end

function sample_distribution(d::Samples, n_draws::IT) where {IT <: Integer}
n_stored_samples = n_samples(d)
samples_idx = StatsBase.sample(collect(1:n_stored_samples), n_draws)
return d.distribution_samples[:,samples_idx]

end

function sample_distribution(d::Parameterized, n_draws::IT) where {IT <: Integer}
return rand(d.distribution, n_draws)
end


#apply transforms

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find the terms real <-> prior could be confusing. For example, I may think of the prior distribution as also being in the real space, since this is the space where my prior belief is based on. What do you think about transform_bounded_to_unbounded and transform_unbounded_to_bounded?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah i mulled over this for a while actually. The terminology came from these 2 perspectives
(i) The modelling assumes we are based in the Real space. e.g we think of the parameters are y in Real and F: Real -> Data is the forward model. The Bayesian methodology we developed works only on an unbounded prior space, and so from the mathematical stance we have a single map G: Prior -> Data. If we use a map G we will need first need a transformation T: Real -> Prior, then the action is F(y) = G(T(y)).

(ii) The other perspective is that the mathematical stance is G: Prior -> Data, and so we define a map S:Prior -> Real and then we apply the model F, so G(x) = F(S(x)).

From perspective (i) the prior is a e.g lognormal distribution and your forward map is the model, but in order to work with it you need to work in a computational space. From (ii) the prior is a normal distribution, and the forward model maps this to the data (the black box workings don't matter), I preferred the latter because the theory is based on this Prior space and not on the Real space, and this is actually how our code works.

So coming to the naming, I don't like mentioning bounded because this is only 1 of the 4 cases. But you are right the one key purpose is map to an unbounded space, i do like that; but then is it clear where your prior distribution is defined?

Copy link
Contributor

@ilopezgp ilopezgp Nov 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed, let's see what the others think. I think if one of the two includes unbounded or unconstrained, there should be no confusion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree the naming of transform_real_to_prior is a bit confusing. As long as it is explicitly documented in comments and docs it could be fine, since on the surface level a user would probably assume the distribution they are providing is the one in the Prior space rather than the Real space.

"""
transform_real_to_prior(pds::ParameterDistributions, x::Array{Real})

Apply the transformation to map real (and possibly constrained) parameters `xarray` into the unbounded prior space
"""
function transform_real_to_prior(pds::ParameterDistributions, xarray::Array{FT}) where {FT <: Real}
#split xarray into chunks to be fed into each distribution
ilopezgp marked this conversation as resolved.
Show resolved Hide resolved
clen = [len(pd.constraints) for pd in pds.parameter_distributions] # e.g [2,3,1,3]
cumulative_clen = [sum(clen[1:i]) for i = 1:size(clen)[1]] # e.g [1 1:2, 3:5, 6, 7:9 ]
x_idx = Dict{Integer,Array{Integer}}(i => collect(cumulative_clen[i - 1] + 1:cumulative_clen[i])
for i in 2:size(cumulative_clen)[1])
x_idx[1] = collect(1:cumulative_clen[1])

return cat([transform_real_to_prior(pd,xarray[x_idx[i]]) for (i,pd) in enumerate(pds.parameter_distributions)]...,dims=1)
end

function transform_real_to_prior(pd::ParameterDistribution, xarray::Array{FT}) where {FT <: Real}
return [transform_real_to_prior(constraint,xarray[i]) for (i,constraint) in enumerate(pd.constraints)]
end

"""
No constraint mapping x -> x
"""
function transform_real_to_prior(c::NoConstraint , x::FT) where {FT <: Real}
return x
end

"""
Bounded below -> unbounded, use mapping x -> log(x - lower_bound)
"""
function transform_real_to_prior(c::BoundedBelow, x::FT) where {FT <: Real}
return log(x - c.lower_bound)
end

"""
Bounded above -> unbounded, use mapping x -> log(upper_bound - x)
"""
function transform_real_to_prior(c::BoundedAbove, x::FT) where {FT <: Real}
return log(c.upper_bound - x)
end

"""
Bounded -> unbounded, use mapping x -> log((x - lower_bound) / (upper_bound - x)
"""
function transform_real_to_prior(c::Bounded, x::FT) where {FT <: Real}
return log( (x - c.lower_bound) / (c.upper_bound - x))
end



"""
transform_prior_to_real(pds::ParameterDistributions, xarray::Array{Real})

Apply the transformation to map parameters `xarray` from the unbounded space into (possibly constrained) real space
"""
function transform_prior_to_real(pds::ParameterDistributions,
xarray::Array{FT}) where {FT <: Real}

#split xarray into chunks to be fed into each distribution
clen = [len(pd.constraints) for pd in pds.parameter_distributions] # e.g [2,3,1,3]
cumulative_clen = [sum(clen[1:i]) for i = 1:size(clen)[1]] # e.g [1 1:2, 3:5, 6, 7:9 ]
x_idx = Dict{Integer,Array{Integer}}(i => collect(cumulative_clen[i-1]+1:cumulative_clen[i])
for i in 2:size(cumulative_clen)[1])
x_idx[1] = collect(1:cumulative_clen[1])

return cat([transform_prior_to_real(pd,xarray[x_idx[i]]) for (i,pd) in enumerate(pds.parameter_distributions)]...,dims=1)
end

function transform_prior_to_real(pd::ParameterDistribution, xarray::Array{FT}) where {FT <: Real}
return [transform_prior_to_real(constraint,xarray[i]) for (i,constraint) in enumerate(pd.constraints)]
end

"""
No constraint mapping x -> x
"""
function transform_prior_to_real(c::NoConstraint , x::FT) where {FT <: Real}
return x
end

"""
Unbounded -> bounded below, use mapping x -> exp(x) + lower_bound
"""
function transform_prior_to_real(c::BoundedBelow, x::FT) where {FT <: Real}
return exp(x) + c.lower_bound
end

"""
Unbounded -> bounded above, use mapping x -> upper_bound - exp(x)
"""
function transform_prior_to_real(c::BoundedAbove, x::FT) where {FT <: Real}
return c.upper_bound - exp(x)
end

"""
Unbounded -> bounded, use mapping x -> (upper_bound * exp(x) + lower_bound) / (exp(x) + 1)
"""
function transform_prior_to_real(c::Bounded, x::FT) where {FT <: Real}
return (c.upper_bound * exp(x) + c.lower_bound) / (exp(x) + 1)
end



end # of module
Loading